1 /*- 2 * Copyright (c) 1996, by Steve Passe 3 * Copyright (c) 2003, by Peter Wemm 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. The name of the developer may NOT be used to endorse or promote products 12 * derived from this software without specific prior written permission. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include "opt_acpi.h" 31 #ifdef __i386__ 32 #include "opt_apic.h" 33 #endif 34 #include "opt_cpu.h" 35 #include "opt_ddb.h" 36 #include "opt_gdb.h" 37 #include "opt_kstack_pages.h" 38 #include "opt_pmap.h" 39 #include "opt_sched.h" 40 #include "opt_smp.h" 41 #include "opt_stack.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/asan.h> 46 #include <sys/bus.h> 47 #include <sys/cons.h> /* cngetc() */ 48 #include <sys/cpuset.h> 49 #include <sys/csan.h> 50 #include <sys/interrupt.h> 51 #include <sys/kdb.h> 52 #include <sys/kernel.h> 53 #include <sys/ktr.h> 54 #include <sys/lock.h> 55 #include <sys/malloc.h> 56 #include <sys/memrange.h> 57 #include <sys/mutex.h> 58 #include <sys/pcpu.h> 59 #include <sys/proc.h> 60 #include <sys/sched.h> 61 #include <sys/smp.h> 62 #include <sys/sysctl.h> 63 64 #include <vm/vm.h> 65 #include <vm/vm_param.h> 66 #include <vm/pmap.h> 67 #include <vm/vm_kern.h> 68 #include <vm/vm_extern.h> 69 #include <vm/vm_map.h> 70 71 #include <x86/apicreg.h> 72 #include <machine/clock.h> 73 #include <machine/cpu.h> 74 #include <machine/cputypes.h> 75 #include <x86/mca.h> 76 #include <machine/md_var.h> 77 #include <machine/pcb.h> 78 #include <machine/psl.h> 79 #include <machine/smp.h> 80 #include <machine/specialreg.h> 81 #include <machine/stack.h> 82 #include <x86/ucode.h> 83 84 #ifdef DEV_ACPI 85 #include <contrib/dev/acpica/include/acpi.h> 86 #include <dev/acpica/acpivar.h> 87 #endif 88 89 static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items"); 90 91 int mp_naps; /* # of Applications processors */ 92 int boot_cpu_id = -1; /* designated BSP */ 93 94 /* AP uses this during bootstrap. Do not staticize. */ 95 char *bootSTK; 96 int bootAP; 97 98 /* Free these after use */ 99 void *bootstacks[MAXCPU]; 100 void *dpcpu; 101 102 struct susppcb **susppcbs; 103 104 #ifdef COUNT_IPIS 105 /* Interrupt counts. */ 106 static u_long *ipi_preempt_counts[MAXCPU]; 107 static u_long *ipi_ast_counts[MAXCPU]; 108 u_long *ipi_invltlb_counts[MAXCPU]; 109 u_long *ipi_invlrng_counts[MAXCPU]; 110 u_long *ipi_invlpg_counts[MAXCPU]; 111 u_long *ipi_invlcache_counts[MAXCPU]; 112 u_long *ipi_rendezvous_counts[MAXCPU]; 113 static u_long *ipi_hardclock_counts[MAXCPU]; 114 #endif 115 116 /* Default cpu_ops implementation. */ 117 struct cpu_ops cpu_ops; 118 119 /* 120 * Local data and functions. 121 */ 122 123 static volatile cpuset_t ipi_stop_nmi_pending; 124 125 volatile cpuset_t resuming_cpus; 126 volatile cpuset_t toresume_cpus; 127 128 /* used to hold the AP's until we are ready to release them */ 129 struct mtx ap_boot_mtx; 130 131 /* Set to 1 once we're ready to let the APs out of the pen. */ 132 volatile int aps_ready = 0; 133 134 /* 135 * Store data from cpu_add() until later in the boot when we actually setup 136 * the APs. 137 */ 138 struct cpu_info *cpu_info; 139 int *apic_cpuids; 140 int cpu_apic_ids[MAXCPU]; 141 _Static_assert(MAXCPU <= MAX_APIC_ID, 142 "MAXCPU cannot be larger that MAX_APIC_ID"); 143 _Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID, 144 "xAPIC_MAX_APIC_ID cannot be larger that MAX_APIC_ID"); 145 146 static void release_aps(void *dummy); 147 static void cpustop_handler_post(u_int cpu); 148 149 static int hyperthreading_allowed = 1; 150 SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN, 151 &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs"); 152 153 static int hyperthreading_intr_allowed = 0; 154 SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN, 155 &hyperthreading_intr_allowed, 0, 156 "Allow interrupts on HTT logical CPUs"); 157 158 static struct topo_node topo_root; 159 160 static int pkg_id_shift; 161 static int node_id_shift; 162 static int core_id_shift; 163 static int disabled_cpus; 164 165 struct cache_info { 166 int id_shift; 167 int present; 168 } static caches[MAX_CACHE_LEVELS]; 169 170 static bool stop_mwait = false; 171 SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0, 172 "Use MONITOR/MWAIT when stopping CPU, if available"); 173 174 void 175 mem_range_AP_init(void) 176 { 177 178 if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) 179 mem_range_softc.mr_op->initAP(&mem_range_softc); 180 } 181 182 /* 183 * Round up to the next power of two, if necessary, and then 184 * take log2. 185 * Returns -1 if argument is zero. 186 */ 187 static __inline int 188 mask_width(u_int x) 189 { 190 191 return (fls(x << (1 - powerof2(x))) - 1); 192 } 193 194 /* 195 * Add a cache level to the cache topology description. 196 */ 197 static int 198 add_deterministic_cache(int type, int level, int share_count) 199 { 200 201 if (type == 0) 202 return (0); 203 if (type > 3) { 204 printf("unexpected cache type %d\n", type); 205 return (1); 206 } 207 if (type == 2) /* ignore instruction cache */ 208 return (1); 209 if (level == 0 || level > MAX_CACHE_LEVELS) { 210 printf("unexpected cache level %d\n", level); 211 return (1); 212 } 213 214 if (caches[level - 1].present) { 215 printf("WARNING: multiple entries for L%u data cache\n", level); 216 printf("%u => %u\n", caches[level - 1].id_shift, 217 mask_width(share_count)); 218 } 219 caches[level - 1].id_shift = mask_width(share_count); 220 caches[level - 1].present = 1; 221 222 if (caches[level - 1].id_shift > pkg_id_shift) { 223 printf("WARNING: L%u data cache covers more " 224 "APIC IDs than a package (%u > %u)\n", level, 225 caches[level - 1].id_shift, pkg_id_shift); 226 caches[level - 1].id_shift = pkg_id_shift; 227 } 228 if (caches[level - 1].id_shift < core_id_shift) { 229 printf("WARNING: L%u data cache covers fewer " 230 "APIC IDs than a core (%u < %u)\n", level, 231 caches[level - 1].id_shift, core_id_shift); 232 caches[level - 1].id_shift = core_id_shift; 233 } 234 235 return (1); 236 } 237 238 /* 239 * Determine topology of processing units and caches for AMD CPUs. 240 * See: 241 * - AMD CPUID Specification (Publication # 25481) 242 * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559) 243 * - BKDG For AMD Family 10h Processors (Publication # 31116) 244 * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301) 245 * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751) 246 * - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945) 247 */ 248 static void 249 topo_probe_amd(void) 250 { 251 u_int p[4]; 252 uint64_t v; 253 int level; 254 int nodes_per_socket; 255 int share_count; 256 int type; 257 int i; 258 259 /* No multi-core capability. */ 260 if ((amd_feature2 & AMDID2_CMP) == 0) 261 return; 262 263 /* For families 10h and newer. */ 264 pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >> 265 AMDID_COREID_SIZE_SHIFT; 266 267 /* For 0Fh family. */ 268 if (pkg_id_shift == 0) 269 pkg_id_shift = 270 mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1); 271 272 /* 273 * Families prior to 16h define the following value as 274 * cores per compute unit and we don't really care about the AMD 275 * compute units at the moment. Perhaps we should treat them as 276 * cores and cores within the compute units as hardware threads, 277 * but that's up for debate. 278 * Later families define the value as threads per compute unit, 279 * so we are following AMD's nomenclature here. 280 */ 281 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 && 282 CPUID_TO_FAMILY(cpu_id) >= 0x16) { 283 cpuid_count(0x8000001e, 0, p); 284 share_count = ((p[1] >> 8) & 0xff) + 1; 285 core_id_shift = mask_width(share_count); 286 287 /* 288 * For Zen (17h), gather Nodes per Processor. Each node is a 289 * Zeppelin die; TR and EPYC CPUs will have multiple dies per 290 * package. Communication latency between dies is higher than 291 * within them. 292 */ 293 nodes_per_socket = ((p[2] >> 8) & 0x7) + 1; 294 node_id_shift = pkg_id_shift - mask_width(nodes_per_socket); 295 } 296 297 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) { 298 for (i = 0; ; i++) { 299 cpuid_count(0x8000001d, i, p); 300 type = p[0] & 0x1f; 301 level = (p[0] >> 5) & 0x7; 302 share_count = 1 + ((p[0] >> 14) & 0xfff); 303 304 if (!add_deterministic_cache(type, level, share_count)) 305 break; 306 } 307 } else { 308 if (cpu_exthigh >= 0x80000005) { 309 cpuid_count(0x80000005, 0, p); 310 if (((p[2] >> 24) & 0xff) != 0) { 311 caches[0].id_shift = 0; 312 caches[0].present = 1; 313 } 314 } 315 if (cpu_exthigh >= 0x80000006) { 316 cpuid_count(0x80000006, 0, p); 317 if (((p[2] >> 16) & 0xffff) != 0) { 318 caches[1].id_shift = 0; 319 caches[1].present = 1; 320 } 321 if (((p[3] >> 18) & 0x3fff) != 0) { 322 nodes_per_socket = 1; 323 if ((amd_feature2 & AMDID2_NODE_ID) != 0) { 324 /* 325 * Handle multi-node processors that 326 * have multiple chips, each with its 327 * own L3 cache, on the same die. 328 */ 329 v = rdmsr(0xc001100c); 330 nodes_per_socket = 1 + ((v >> 3) & 0x7); 331 } 332 caches[2].id_shift = 333 pkg_id_shift - mask_width(nodes_per_socket); 334 caches[2].present = 1; 335 } 336 } 337 } 338 } 339 340 /* 341 * Determine topology of processing units for Intel CPUs 342 * using CPUID Leaf 1 and Leaf 4, if supported. 343 * See: 344 * - Intel 64 Architecture Processor Topology Enumeration 345 * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual, 346 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS 347 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS 348 */ 349 static void 350 topo_probe_intel_0x4(void) 351 { 352 u_int p[4]; 353 int max_cores; 354 int max_logical; 355 356 /* Both zero and one here mean one logical processor per package. */ 357 max_logical = (cpu_feature & CPUID_HTT) != 0 ? 358 (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; 359 if (max_logical <= 1) 360 return; 361 362 if (cpu_high >= 0x4) { 363 cpuid_count(0x04, 0, p); 364 max_cores = ((p[0] >> 26) & 0x3f) + 1; 365 } else 366 max_cores = 1; 367 368 core_id_shift = mask_width(max_logical/max_cores); 369 KASSERT(core_id_shift >= 0, 370 ("intel topo: max_cores > max_logical\n")); 371 pkg_id_shift = core_id_shift + mask_width(max_cores); 372 } 373 374 /* 375 * Determine topology of processing units for Intel CPUs 376 * using CPUID Leaf 1Fh or 0Bh, if supported. 377 * See: 378 * - Intel 64 Architecture Processor Topology Enumeration 379 * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual, 380 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS 381 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS 382 */ 383 static void 384 topo_probe_intel_0xb(void) 385 { 386 u_int leaf; 387 u_int p[4] = { 0 }; 388 int bits; 389 int type; 390 int i; 391 392 /* Prefer leaf 1Fh (V2 Extended Topology Enumeration). */ 393 if (cpu_high >= 0x1f) { 394 leaf = 0x1f; 395 cpuid_count(leaf, 0, p); 396 } 397 /* Fall back to leaf 0Bh (Extended Topology Enumeration). */ 398 if (p[1] == 0) { 399 leaf = 0x0b; 400 cpuid_count(leaf, 0, p); 401 } 402 /* Fall back to leaf 04h (Deterministic Cache Parameters). */ 403 if (p[1] == 0) { 404 topo_probe_intel_0x4(); 405 return; 406 } 407 408 /* We only support three levels for now. */ 409 for (i = 0; ; i++) { 410 cpuid_count(leaf, i, p); 411 412 bits = p[0] & 0x1f; 413 type = (p[2] >> 8) & 0xff; 414 415 if (type == 0) 416 break; 417 418 if (type == CPUID_TYPE_SMT) 419 core_id_shift = bits; 420 else if (type == CPUID_TYPE_CORE) 421 pkg_id_shift = bits; 422 else if (bootverbose) 423 printf("Topology level type %d shift: %d\n", type, bits); 424 } 425 426 if (pkg_id_shift < core_id_shift) { 427 printf("WARNING: core covers more APIC IDs than a package\n"); 428 core_id_shift = pkg_id_shift; 429 } 430 } 431 432 /* 433 * Determine topology of caches for Intel CPUs. 434 * See: 435 * - Intel 64 Architecture Processor Topology Enumeration 436 * - Intel 64 and IA-32 Architectures Software Developer’s Manual 437 * Volume 2A: Instruction Set Reference, A-M, 438 * CPUID instruction 439 */ 440 static void 441 topo_probe_intel_caches(void) 442 { 443 u_int p[4]; 444 int level; 445 int share_count; 446 int type; 447 int i; 448 449 if (cpu_high < 0x4) { 450 /* 451 * Available cache level and sizes can be determined 452 * via CPUID leaf 2, but that requires a huge table of hardcoded 453 * values, so for now just assume L1 and L2 caches potentially 454 * shared only by HTT processing units, if HTT is present. 455 */ 456 caches[0].id_shift = pkg_id_shift; 457 caches[0].present = 1; 458 caches[1].id_shift = pkg_id_shift; 459 caches[1].present = 1; 460 return; 461 } 462 463 for (i = 0; ; i++) { 464 cpuid_count(0x4, i, p); 465 type = p[0] & 0x1f; 466 level = (p[0] >> 5) & 0x7; 467 share_count = 1 + ((p[0] >> 14) & 0xfff); 468 469 if (!add_deterministic_cache(type, level, share_count)) 470 break; 471 } 472 } 473 474 /* 475 * Determine topology of processing units and caches for Intel CPUs. 476 * See: 477 * - Intel 64 Architecture Processor Topology Enumeration 478 */ 479 static void 480 topo_probe_intel(void) 481 { 482 483 /* 484 * Note that 0x1 <= cpu_high < 4 case should be 485 * compatible with topo_probe_intel_0x4() logic when 486 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) 487 * or it should trigger the fallback otherwise. 488 */ 489 if (cpu_high >= 0xb) 490 topo_probe_intel_0xb(); 491 else if (cpu_high >= 0x1) 492 topo_probe_intel_0x4(); 493 494 topo_probe_intel_caches(); 495 } 496 497 /* 498 * Topology information is queried only on BSP, on which this 499 * code runs and for which it can query CPUID information. 500 * Then topology is extrapolated on all packages using an 501 * assumption that APIC ID to hardware component ID mapping is 502 * homogenious. 503 * That doesn't necesserily imply that the topology is uniform. 504 */ 505 void 506 topo_probe(void) 507 { 508 static int cpu_topo_probed = 0; 509 struct x86_topo_layer { 510 int type; 511 int subtype; 512 int id_shift; 513 } topo_layers[MAX_CACHE_LEVELS + 5]; 514 struct topo_node *parent; 515 struct topo_node *node; 516 int layer; 517 int nlayers; 518 int node_id; 519 int i; 520 #if defined(DEV_ACPI) && MAXMEMDOM > 1 521 int d, domain; 522 #endif 523 524 if (cpu_topo_probed) 525 return; 526 527 CPU_ZERO(&logical_cpus_mask); 528 529 if (mp_ncpus <= 1) 530 ; /* nothing */ 531 else if (cpu_vendor_id == CPU_VENDOR_AMD || 532 cpu_vendor_id == CPU_VENDOR_HYGON) 533 topo_probe_amd(); 534 else if (cpu_vendor_id == CPU_VENDOR_INTEL) 535 topo_probe_intel(); 536 537 KASSERT(pkg_id_shift >= core_id_shift, 538 ("bug in APIC topology discovery")); 539 540 nlayers = 0; 541 bzero(topo_layers, sizeof(topo_layers)); 542 543 topo_layers[nlayers].type = TOPO_TYPE_PKG; 544 topo_layers[nlayers].id_shift = pkg_id_shift; 545 if (bootverbose) 546 printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift); 547 nlayers++; 548 549 if (pkg_id_shift > node_id_shift && node_id_shift != 0) { 550 topo_layers[nlayers].type = TOPO_TYPE_GROUP; 551 topo_layers[nlayers].id_shift = node_id_shift; 552 if (bootverbose) 553 printf("Node ID shift: %u\n", 554 topo_layers[nlayers].id_shift); 555 nlayers++; 556 } 557 558 /* 559 * Consider all caches to be within a package/chip 560 * and "in front" of all sub-components like 561 * cores and hardware threads. 562 */ 563 for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) { 564 if (caches[i].present) { 565 if (node_id_shift != 0) 566 KASSERT(caches[i].id_shift <= node_id_shift, 567 ("bug in APIC topology discovery")); 568 KASSERT(caches[i].id_shift <= pkg_id_shift, 569 ("bug in APIC topology discovery")); 570 KASSERT(caches[i].id_shift >= core_id_shift, 571 ("bug in APIC topology discovery")); 572 573 topo_layers[nlayers].type = TOPO_TYPE_CACHE; 574 topo_layers[nlayers].subtype = i + 1; 575 topo_layers[nlayers].id_shift = caches[i].id_shift; 576 if (bootverbose) 577 printf("L%u cache ID shift: %u\n", 578 topo_layers[nlayers].subtype, 579 topo_layers[nlayers].id_shift); 580 nlayers++; 581 } 582 } 583 584 if (pkg_id_shift > core_id_shift) { 585 topo_layers[nlayers].type = TOPO_TYPE_CORE; 586 topo_layers[nlayers].id_shift = core_id_shift; 587 if (bootverbose) 588 printf("Core ID shift: %u\n", 589 topo_layers[nlayers].id_shift); 590 nlayers++; 591 } 592 593 topo_layers[nlayers].type = TOPO_TYPE_PU; 594 topo_layers[nlayers].id_shift = 0; 595 nlayers++; 596 597 #if defined(DEV_ACPI) && MAXMEMDOM > 1 598 if (vm_ndomains > 1) { 599 for (layer = 0; layer < nlayers; ++layer) { 600 for (i = 0; i <= max_apic_id; ++i) { 601 if ((i & ((1 << topo_layers[layer].id_shift) - 1)) == 0) 602 domain = -1; 603 if (!cpu_info[i].cpu_present) 604 continue; 605 d = acpi_pxm_get_cpu_locality(i); 606 if (domain >= 0 && domain != d) 607 break; 608 domain = d; 609 } 610 if (i > max_apic_id) 611 break; 612 } 613 KASSERT(layer < nlayers, ("NUMA domain smaller than PU")); 614 memmove(&topo_layers[layer+1], &topo_layers[layer], 615 sizeof(*topo_layers) * (nlayers - layer)); 616 topo_layers[layer].type = TOPO_TYPE_NODE; 617 topo_layers[layer].subtype = CG_SHARE_NONE; 618 nlayers++; 619 } 620 #endif 621 622 topo_init_root(&topo_root); 623 for (i = 0; i <= max_apic_id; ++i) { 624 if (!cpu_info[i].cpu_present) 625 continue; 626 627 parent = &topo_root; 628 for (layer = 0; layer < nlayers; ++layer) { 629 #if defined(DEV_ACPI) && MAXMEMDOM > 1 630 if (topo_layers[layer].type == TOPO_TYPE_NODE) { 631 node_id = acpi_pxm_get_cpu_locality(i); 632 } else 633 #endif 634 node_id = i >> topo_layers[layer].id_shift; 635 parent = topo_add_node_by_hwid(parent, node_id, 636 topo_layers[layer].type, 637 topo_layers[layer].subtype); 638 } 639 } 640 641 parent = &topo_root; 642 for (layer = 0; layer < nlayers; ++layer) { 643 #if defined(DEV_ACPI) && MAXMEMDOM > 1 644 if (topo_layers[layer].type == TOPO_TYPE_NODE) 645 node_id = acpi_pxm_get_cpu_locality(boot_cpu_id); 646 else 647 #endif 648 node_id = boot_cpu_id >> topo_layers[layer].id_shift; 649 node = topo_find_node_by_hwid(parent, node_id, 650 topo_layers[layer].type, 651 topo_layers[layer].subtype); 652 topo_promote_child(node); 653 parent = node; 654 } 655 656 cpu_topo_probed = 1; 657 } 658 659 /* 660 * Assign logical CPU IDs to local APICs. 661 */ 662 void 663 assign_cpu_ids(void) 664 { 665 struct topo_node *node; 666 u_int smt_mask; 667 int nhyper; 668 669 smt_mask = (1u << core_id_shift) - 1; 670 671 /* 672 * Assign CPU IDs to local APIC IDs and disable any CPUs 673 * beyond MAXCPU. CPU 0 is always assigned to the BSP. 674 */ 675 mp_ncpus = 0; 676 nhyper = 0; 677 TOPO_FOREACH(node, &topo_root) { 678 if (node->type != TOPO_TYPE_PU) 679 continue; 680 681 if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask)) 682 cpu_info[node->hwid].cpu_hyperthread = 1; 683 684 if (resource_disabled("lapic", node->hwid)) { 685 if (node->hwid != boot_cpu_id) 686 cpu_info[node->hwid].cpu_disabled = 1; 687 else 688 printf("Cannot disable BSP, APIC ID = %d\n", 689 node->hwid); 690 } 691 692 if (!hyperthreading_allowed && 693 cpu_info[node->hwid].cpu_hyperthread) 694 cpu_info[node->hwid].cpu_disabled = 1; 695 696 if (mp_ncpus >= MAXCPU) 697 cpu_info[node->hwid].cpu_disabled = 1; 698 699 if (cpu_info[node->hwid].cpu_disabled) { 700 disabled_cpus++; 701 continue; 702 } 703 704 if (cpu_info[node->hwid].cpu_hyperthread) 705 nhyper++; 706 707 cpu_apic_ids[mp_ncpus] = node->hwid; 708 apic_cpuids[node->hwid] = mp_ncpus; 709 topo_set_pu_id(node, mp_ncpus); 710 mp_ncpus++; 711 } 712 713 KASSERT(mp_maxid >= mp_ncpus - 1, 714 ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, 715 mp_ncpus)); 716 717 mp_ncores = mp_ncpus - nhyper; 718 smp_threads_per_core = mp_ncpus / mp_ncores; 719 } 720 721 /* 722 * Print various information about the SMP system hardware and setup. 723 */ 724 void 725 cpu_mp_announce(void) 726 { 727 struct topo_node *node; 728 const char *hyperthread; 729 struct topo_analysis topology; 730 731 printf("FreeBSD/SMP: "); 732 if (topo_analyze(&topo_root, 1, &topology)) { 733 printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]); 734 if (topology.entities[TOPO_LEVEL_GROUP] > 1) 735 printf(" x %d groups", 736 topology.entities[TOPO_LEVEL_GROUP]); 737 if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) 738 printf(" x %d cache groups", 739 topology.entities[TOPO_LEVEL_CACHEGROUP]); 740 if (topology.entities[TOPO_LEVEL_CORE] > 0) 741 printf(" x %d core(s)", 742 topology.entities[TOPO_LEVEL_CORE]); 743 if (topology.entities[TOPO_LEVEL_THREAD] > 1) 744 printf(" x %d hardware threads", 745 topology.entities[TOPO_LEVEL_THREAD]); 746 } else { 747 printf("Non-uniform topology"); 748 } 749 printf("\n"); 750 751 if (disabled_cpus) { 752 printf("FreeBSD/SMP Online: "); 753 if (topo_analyze(&topo_root, 0, &topology)) { 754 printf("%d package(s)", 755 topology.entities[TOPO_LEVEL_PKG]); 756 if (topology.entities[TOPO_LEVEL_GROUP] > 1) 757 printf(" x %d groups", 758 topology.entities[TOPO_LEVEL_GROUP]); 759 if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) 760 printf(" x %d cache groups", 761 topology.entities[TOPO_LEVEL_CACHEGROUP]); 762 if (topology.entities[TOPO_LEVEL_CORE] > 0) 763 printf(" x %d core(s)", 764 topology.entities[TOPO_LEVEL_CORE]); 765 if (topology.entities[TOPO_LEVEL_THREAD] > 1) 766 printf(" x %d hardware threads", 767 topology.entities[TOPO_LEVEL_THREAD]); 768 } else { 769 printf("Non-uniform topology"); 770 } 771 printf("\n"); 772 } 773 774 if (!bootverbose) 775 return; 776 777 TOPO_FOREACH(node, &topo_root) { 778 switch (node->type) { 779 case TOPO_TYPE_PKG: 780 printf("Package HW ID = %u\n", node->hwid); 781 break; 782 case TOPO_TYPE_CORE: 783 printf("\tCore HW ID = %u\n", node->hwid); 784 break; 785 case TOPO_TYPE_PU: 786 if (cpu_info[node->hwid].cpu_hyperthread) 787 hyperthread = "/HT"; 788 else 789 hyperthread = ""; 790 791 if (node->subtype == 0) 792 printf("\t\tCPU (AP%s): APIC ID: %u" 793 "(disabled)\n", hyperthread, node->hwid); 794 else if (node->id == 0) 795 printf("\t\tCPU0 (BSP): APIC ID: %u\n", 796 node->hwid); 797 else 798 printf("\t\tCPU%u (AP%s): APIC ID: %u\n", 799 node->id, hyperthread, node->hwid); 800 break; 801 default: 802 /* ignored */ 803 break; 804 } 805 } 806 } 807 808 /* 809 * Add a scheduling group, a group of logical processors sharing 810 * a particular cache (and, thus having an affinity), to the scheduling 811 * topology. 812 * This function recursively works on lower level caches. 813 */ 814 static void 815 x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root) 816 { 817 struct topo_node *node; 818 int nchildren; 819 int ncores; 820 int i; 821 822 KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE || 823 root->type == TOPO_TYPE_NODE || root->type == TOPO_TYPE_GROUP, 824 ("x86topo_add_sched_group: bad type: %u", root->type)); 825 CPU_COPY(&root->cpuset, &cg_root->cg_mask); 826 cg_root->cg_count = root->cpu_count; 827 if (root->type == TOPO_TYPE_CACHE) 828 cg_root->cg_level = root->subtype; 829 else 830 cg_root->cg_level = CG_SHARE_NONE; 831 if (root->type == TOPO_TYPE_NODE) 832 cg_root->cg_flags = CG_FLAG_NODE; 833 else 834 cg_root->cg_flags = 0; 835 836 /* 837 * Check how many core nodes we have under the given root node. 838 * If we have multiple logical processors, but not multiple 839 * cores, then those processors must be hardware threads. 840 */ 841 ncores = 0; 842 node = root; 843 while (node != NULL) { 844 if (node->type != TOPO_TYPE_CORE) { 845 node = topo_next_node(root, node); 846 continue; 847 } 848 849 ncores++; 850 node = topo_next_nonchild_node(root, node); 851 } 852 853 if (cg_root->cg_level != CG_SHARE_NONE && 854 root->cpu_count > 1 && ncores < 2) 855 cg_root->cg_flags |= CG_FLAG_SMT; 856 857 /* 858 * Find out how many cache nodes we have under the given root node. 859 * We ignore cache nodes that cover all the same processors as the 860 * root node. Also, we do not descend below found cache nodes. 861 * That is, we count top-level "non-redundant" caches under the root 862 * node. 863 */ 864 nchildren = 0; 865 node = root; 866 while (node != NULL) { 867 /* 868 * When some APICs are disabled by tunables, nodes can end up 869 * with an empty cpuset. Nodes with an empty cpuset will be 870 * translated into cpu groups with empty cpusets. smp_topo_fill 871 * will then set cg_first and cg_last to -1. This isn't 872 * correctly handled in all functions. E.g. when 873 * cpu_search_lowest and cpu_search_highest loop through all 874 * cpus, they call CPU_ISSET on cpu -1 which ends up in a 875 * general protection fault. 876 * 877 * We could fix the scheduler to handle empty cpu groups 878 * correctly. Nevertheless, empty cpu groups are causing 879 * overhead for no value. So, it makes more sense to just don't 880 * create them. 881 */ 882 if (CPU_EMPTY(&node->cpuset)) { 883 node = topo_next_node(root, node); 884 continue; 885 } 886 if (CPU_CMP(&node->cpuset, &root->cpuset) == 0) { 887 if (node->type == TOPO_TYPE_CACHE && 888 cg_root->cg_level < node->subtype) 889 cg_root->cg_level = node->subtype; 890 if (node->type == TOPO_TYPE_NODE) 891 cg_root->cg_flags |= CG_FLAG_NODE; 892 node = topo_next_node(root, node); 893 continue; 894 } 895 if (node->type != TOPO_TYPE_GROUP && 896 node->type != TOPO_TYPE_NODE && 897 node->type != TOPO_TYPE_CACHE) { 898 node = topo_next_node(root, node); 899 continue; 900 } 901 nchildren++; 902 node = topo_next_nonchild_node(root, node); 903 } 904 905 /* 906 * We are not interested in nodes including only one CPU each. 907 */ 908 if (nchildren == root->cpu_count) 909 return; 910 911 /* 912 * We are not interested in nodes without children. 913 */ 914 cg_root->cg_children = nchildren; 915 if (nchildren == 0) 916 return; 917 918 cg_root->cg_child = smp_topo_alloc(nchildren); 919 920 /* 921 * Now find again the same cache nodes as above and recursively 922 * build scheduling topologies for them. 923 */ 924 node = root; 925 i = 0; 926 while (node != NULL) { 927 if ((node->type != TOPO_TYPE_GROUP && 928 node->type != TOPO_TYPE_NODE && 929 node->type != TOPO_TYPE_CACHE) || 930 CPU_CMP(&node->cpuset, &root->cpuset) == 0 || 931 CPU_EMPTY(&node->cpuset)) { 932 node = topo_next_node(root, node); 933 continue; 934 } 935 cg_root->cg_child[i].cg_parent = cg_root; 936 x86topo_add_sched_group(node, &cg_root->cg_child[i]); 937 i++; 938 node = topo_next_nonchild_node(root, node); 939 } 940 } 941 942 /* 943 * Build the MI scheduling topology from the discovered hardware topology. 944 */ 945 struct cpu_group * 946 cpu_topo(void) 947 { 948 struct cpu_group *cg_root; 949 950 if (mp_ncpus <= 1) 951 return (smp_topo_none()); 952 953 cg_root = smp_topo_alloc(1); 954 x86topo_add_sched_group(&topo_root, cg_root); 955 return (cg_root); 956 } 957 958 static void 959 cpu_alloc(void *dummy __unused) 960 { 961 /* 962 * Dynamically allocate the arrays that depend on the 963 * maximum APIC ID. 964 */ 965 cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS, 966 M_WAITOK | M_ZERO); 967 apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS, 968 M_WAITOK | M_ZERO); 969 } 970 SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL); 971 972 /* 973 * Add a logical CPU to the topology. 974 */ 975 void 976 cpu_add(u_int apic_id, char boot_cpu) 977 { 978 979 if (apic_id > max_apic_id) 980 panic("SMP: APIC ID %d too high", apic_id); 981 982 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice", 983 apic_id)); 984 cpu_info[apic_id].cpu_present = 1; 985 if (boot_cpu) { 986 KASSERT(boot_cpu_id == -1, 987 ("CPU %u claims to be BSP, but CPU %u already is", apic_id, 988 boot_cpu_id)); 989 boot_cpu_id = apic_id; 990 cpu_info[apic_id].cpu_bsp = 1; 991 } 992 if (bootverbose) 993 printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" : 994 "AP"); 995 } 996 997 void 998 cpu_mp_setmaxid(void) 999 { 1000 1001 /* 1002 * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). 1003 * If there were no calls to cpu_add() assume this is a UP system. 1004 */ 1005 if (mp_ncpus == 0) 1006 mp_ncpus = 1; 1007 } 1008 1009 int 1010 cpu_mp_probe(void) 1011 { 1012 1013 /* 1014 * Always record BSP in CPU map so that the mbuf init code works 1015 * correctly. 1016 */ 1017 CPU_SETOF(0, &all_cpus); 1018 return (mp_ncpus > 1); 1019 } 1020 1021 /* 1022 * AP CPU's call this to initialize themselves. 1023 */ 1024 void 1025 init_secondary_tail(void) 1026 { 1027 u_int cpuid; 1028 1029 pmap_activate_boot(vmspace_pmap(proc0.p_vmspace)); 1030 1031 /* 1032 * On real hardware, switch to x2apic mode if possible. Do it 1033 * after aps_ready was signalled, to avoid manipulating the 1034 * mode while BSP might still want to send some IPI to us 1035 * (second startup IPI is ignored on modern hardware etc). 1036 */ 1037 lapic_xapic_mode(); 1038 1039 /* Initialize the PAT MSR. */ 1040 pmap_init_pat(); 1041 1042 /* set up CPU registers and state */ 1043 cpu_setregs(); 1044 1045 /* set up SSE/NX */ 1046 initializecpu(); 1047 1048 /* set up FPU state on the AP */ 1049 #ifdef __amd64__ 1050 fpuinit(); 1051 #else 1052 npxinit(false); 1053 #endif 1054 1055 if (cpu_ops.cpu_init) 1056 cpu_ops.cpu_init(); 1057 1058 /* A quick check from sanity claus */ 1059 cpuid = PCPU_GET(cpuid); 1060 if (PCPU_GET(apic_id) != lapic_id()) { 1061 printf("SMP: cpuid = %d\n", cpuid); 1062 printf("SMP: actual apic_id = %d\n", lapic_id()); 1063 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); 1064 panic("cpuid mismatch! boom!!"); 1065 } 1066 1067 /* Initialize curthread. */ 1068 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); 1069 PCPU_SET(curthread, PCPU_GET(idlethread)); 1070 schedinit_ap(); 1071 1072 mtx_lock_spin(&ap_boot_mtx); 1073 1074 mca_init(); 1075 1076 /* Init local apic for irq's */ 1077 lapic_setup(1); 1078 1079 /* Set memory range attributes for this CPU to match the BSP */ 1080 mem_range_AP_init(); 1081 1082 smp_cpus++; 1083 1084 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); 1085 if (bootverbose) 1086 printf("SMP: AP CPU #%d Launched!\n", cpuid); 1087 else 1088 printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "", 1089 cpuid, smp_cpus == mp_ncpus ? "\n" : " "); 1090 1091 /* Determine if we are a logical CPU. */ 1092 if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread) 1093 CPU_SET(cpuid, &logical_cpus_mask); 1094 1095 if (bootverbose) 1096 lapic_dump("AP"); 1097 1098 if (smp_cpus == mp_ncpus) { 1099 /* enable IPI's, tlb shootdown, freezes etc */ 1100 atomic_store_rel_int(&smp_started, 1); 1101 } 1102 1103 #ifdef __amd64__ 1104 if (pmap_pcid_enabled) 1105 load_cr4(rcr4() | CR4_PCIDE); 1106 load_ds(_udatasel); 1107 load_es(_udatasel); 1108 load_fs(_ufssel); 1109 #endif 1110 1111 mtx_unlock_spin(&ap_boot_mtx); 1112 1113 /* Wait until all the AP's are up. */ 1114 while (atomic_load_acq_int(&smp_started) == 0) 1115 ia32_pause(); 1116 1117 #ifndef EARLY_AP_STARTUP 1118 /* Start per-CPU event timers. */ 1119 cpu_initclocks_ap(); 1120 #endif 1121 1122 kcsan_cpu_init(cpuid); 1123 1124 sched_ap_entry(); 1125 1126 panic("scheduler returned us to %s", __func__); 1127 /* NOTREACHED */ 1128 } 1129 1130 static void 1131 smp_after_idle_runnable(void *arg __unused) 1132 { 1133 int cpu; 1134 1135 if (mp_ncpus == 1) 1136 return; 1137 1138 KASSERT(smp_started != 0, ("%s: SMP not started yet", __func__)); 1139 1140 /* 1141 * Wait for all APs to handle an interrupt. After that, we know that 1142 * the APs have entered the scheduler at least once, so the boot stacks 1143 * are safe to free. 1144 */ 1145 smp_rendezvous(smp_no_rendezvous_barrier, NULL, 1146 smp_no_rendezvous_barrier, NULL); 1147 1148 for (cpu = 1; cpu < mp_ncpus; cpu++) { 1149 kmem_free(bootstacks[cpu], kstack_pages * PAGE_SIZE); 1150 } 1151 } 1152 SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY, 1153 smp_after_idle_runnable, NULL); 1154 1155 /* 1156 * We tell the I/O APIC code about all the CPUs we want to receive 1157 * interrupts. If we don't want certain CPUs to receive IRQs we 1158 * can simply not tell the I/O APIC code about them in this function. 1159 * We also do not tell it about the BSP since it tells itself about 1160 * the BSP internally to work with UP kernels and on UP machines. 1161 */ 1162 void 1163 set_interrupt_apic_ids(void) 1164 { 1165 u_int i, apic_id; 1166 1167 for (i = 0; i < MAXCPU; i++) { 1168 apic_id = cpu_apic_ids[i]; 1169 if (apic_id == -1) 1170 continue; 1171 if (cpu_info[apic_id].cpu_bsp) 1172 continue; 1173 if (cpu_info[apic_id].cpu_disabled) 1174 continue; 1175 1176 /* Don't let hyperthreads service interrupts. */ 1177 if (cpu_info[apic_id].cpu_hyperthread && 1178 !hyperthreading_intr_allowed) 1179 continue; 1180 1181 intr_add_cpu(i); 1182 } 1183 } 1184 1185 #ifdef COUNT_XINVLTLB_HITS 1186 u_int xhits_gbl[MAXCPU]; 1187 u_int xhits_pg[MAXCPU]; 1188 u_int xhits_rng[MAXCPU]; 1189 static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1190 ""); 1191 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, 1192 sizeof(xhits_gbl), "IU", ""); 1193 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, 1194 sizeof(xhits_pg), "IU", ""); 1195 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, 1196 sizeof(xhits_rng), "IU", ""); 1197 1198 u_int ipi_global; 1199 u_int ipi_page; 1200 u_int ipi_range; 1201 u_int ipi_range_size; 1202 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); 1203 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); 1204 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); 1205 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 1206 0, ""); 1207 #endif /* COUNT_XINVLTLB_HITS */ 1208 1209 /* 1210 * Init and startup IPI. 1211 */ 1212 void 1213 ipi_startup(int apic_id, int vector) 1214 { 1215 1216 /* 1217 * This attempts to follow the algorithm described in the 1218 * Intel Multiprocessor Specification v1.4 in section B.4. 1219 * For each IPI, we allow the local APIC ~20us to deliver the 1220 * IPI. If that times out, we panic. 1221 */ 1222 1223 /* 1224 * first we do an INIT IPI: this INIT IPI might be run, resetting 1225 * and running the target CPU. OR this INIT IPI might be latched (P5 1226 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be 1227 * ignored. 1228 */ 1229 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | 1230 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); 1231 lapic_ipi_wait(100); 1232 1233 /* Explicitly deassert the INIT IPI. */ 1234 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | 1235 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 1236 apic_id); 1237 1238 DELAY(10000); /* wait ~10mS */ 1239 1240 /* 1241 * next we do a STARTUP IPI: the previous INIT IPI might still be 1242 * latched, (P5 bug) this 1st STARTUP would then terminate 1243 * immediately, and the previously started INIT IPI would continue. OR 1244 * the previous INIT IPI has already run. and this STARTUP IPI will 1245 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI 1246 * will run. 1247 */ 1248 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 1249 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 1250 vector, apic_id); 1251 if (!lapic_ipi_wait(100)) 1252 panic("Failed to deliver first STARTUP IPI to APIC %d", 1253 apic_id); 1254 DELAY(200); /* wait ~200uS */ 1255 1256 /* 1257 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF 1258 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR 1259 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is 1260 * recognized after hardware RESET or INIT IPI. 1261 */ 1262 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 1263 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 1264 vector, apic_id); 1265 if (!lapic_ipi_wait(100)) 1266 panic("Failed to deliver second STARTUP IPI to APIC %d", 1267 apic_id); 1268 1269 DELAY(200); /* wait ~200uS */ 1270 } 1271 1272 static bool 1273 ipi_bitmap_set(int cpu, u_int ipi) 1274 { 1275 u_int bitmap, old, new; 1276 u_int *cpu_bitmap; 1277 1278 bitmap = 1 << ipi; 1279 cpu_bitmap = &cpuid_to_pcpu[cpu]->pc_ipi_bitmap; 1280 old = *cpu_bitmap; 1281 for (;;) { 1282 if ((old & bitmap) != 0) 1283 break; 1284 new = old | bitmap; 1285 if (atomic_fcmpset_int(cpu_bitmap, &old, new)) 1286 break; 1287 } 1288 return (old != 0); 1289 } 1290 1291 /* 1292 * Send an IPI to specified CPU handling the bitmap logic. 1293 */ 1294 static void 1295 ipi_send_cpu(int cpu, u_int ipi) 1296 { 1297 1298 KASSERT((u_int)cpu < MAXCPU && cpu_apic_ids[cpu] != -1, 1299 ("IPI to non-existent CPU %d", cpu)); 1300 1301 if (IPI_IS_BITMAPED(ipi)) { 1302 if (ipi_bitmap_set(cpu, ipi)) 1303 return; 1304 ipi = IPI_BITMAP_VECTOR; 1305 } 1306 lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); 1307 } 1308 1309 void 1310 ipi_bitmap_handler(struct trapframe frame) 1311 { 1312 struct trapframe *oldframe; 1313 struct thread *td; 1314 int cpu = PCPU_GET(cpuid); 1315 u_int ipi_bitmap; 1316 1317 kasan_mark(&frame, sizeof(frame), sizeof(frame), 0); 1318 1319 td = curthread; 1320 ipi_bitmap = atomic_readandclear_int(&cpuid_to_pcpu[cpu]-> 1321 pc_ipi_bitmap); 1322 1323 /* 1324 * sched_preempt() must be called to clear the pending preempt 1325 * IPI to enable delivery of further preempts. However, the 1326 * critical section will cause extra scheduler lock thrashing 1327 * when used unconditionally. Only critical_enter() if 1328 * hardclock must also run, which requires the section entry. 1329 */ 1330 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) 1331 critical_enter(); 1332 1333 td->td_intr_nesting_level++; 1334 oldframe = td->td_intr_frame; 1335 td->td_intr_frame = &frame; 1336 #if defined(STACK) || defined(DDB) 1337 if (ipi_bitmap & (1 << IPI_TRACE)) 1338 stack_capture_intr(); 1339 #endif 1340 if (ipi_bitmap & (1 << IPI_PREEMPT)) { 1341 #ifdef COUNT_IPIS 1342 (*ipi_preempt_counts[cpu])++; 1343 #endif 1344 sched_preempt(td); 1345 } 1346 if (ipi_bitmap & (1 << IPI_AST)) { 1347 #ifdef COUNT_IPIS 1348 (*ipi_ast_counts[cpu])++; 1349 #endif 1350 /* Nothing to do for AST */ 1351 } 1352 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { 1353 #ifdef COUNT_IPIS 1354 (*ipi_hardclock_counts[cpu])++; 1355 #endif 1356 hardclockintr(); 1357 } 1358 td->td_intr_frame = oldframe; 1359 td->td_intr_nesting_level--; 1360 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) 1361 critical_exit(); 1362 } 1363 1364 /* 1365 * send an IPI to a set of cpus. 1366 */ 1367 void 1368 ipi_selected(cpuset_t cpus, u_int ipi) 1369 { 1370 int cpu; 1371 1372 /* 1373 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1374 * of help in order to understand what is the source. 1375 * Set the mask of receiving CPUs for this purpose. 1376 */ 1377 if (ipi == IPI_STOP_HARD) 1378 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus); 1379 1380 CPU_FOREACH_ISSET(cpu, &cpus) { 1381 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1382 ipi_send_cpu(cpu, ipi); 1383 } 1384 } 1385 1386 /* 1387 * send an IPI to a specific CPU. 1388 */ 1389 void 1390 ipi_cpu(int cpu, u_int ipi) 1391 { 1392 1393 /* 1394 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1395 * of help in order to understand what is the source. 1396 * Set the mask of receiving CPUs for this purpose. 1397 */ 1398 if (ipi == IPI_STOP_HARD) 1399 CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending); 1400 1401 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1402 ipi_send_cpu(cpu, ipi); 1403 } 1404 1405 /* 1406 * send an IPI to all CPUs EXCEPT myself 1407 */ 1408 void 1409 ipi_all_but_self(u_int ipi) 1410 { 1411 cpuset_t other_cpus; 1412 int cpu, c; 1413 1414 /* 1415 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1416 * of help in order to understand what is the source. 1417 * Set the mask of receiving CPUs for this purpose. 1418 */ 1419 if (ipi == IPI_STOP_HARD) { 1420 other_cpus = all_cpus; 1421 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 1422 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus); 1423 } 1424 1425 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); 1426 if (IPI_IS_BITMAPED(ipi)) { 1427 cpu = PCPU_GET(cpuid); 1428 CPU_FOREACH(c) { 1429 if (c != cpu) 1430 ipi_bitmap_set(c, ipi); 1431 } 1432 ipi = IPI_BITMAP_VECTOR; 1433 } 1434 lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); 1435 } 1436 1437 void 1438 ipi_self_from_nmi(u_int vector) 1439 { 1440 1441 lapic_ipi_vectored(vector, APIC_IPI_DEST_SELF); 1442 1443 /* Wait for IPI to finish. */ 1444 if (!lapic_ipi_wait(50000)) { 1445 if (KERNEL_PANICKED()) 1446 return; 1447 else 1448 panic("APIC: IPI is stuck"); 1449 } 1450 } 1451 1452 int 1453 ipi_nmi_handler(void) 1454 { 1455 u_int cpuid; 1456 1457 /* 1458 * As long as there is not a simple way to know about a NMI's 1459 * source, if the bitmask for the current CPU is present in 1460 * the global pending bitword an IPI_STOP_HARD has been issued 1461 * and should be handled. 1462 */ 1463 cpuid = PCPU_GET(cpuid); 1464 if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending)) 1465 return (1); 1466 1467 CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending); 1468 cpustop_handler(); 1469 return (0); 1470 } 1471 1472 int nmi_kdb_lock; 1473 1474 void 1475 nmi_call_kdb_smp(u_int type, struct trapframe *frame) 1476 { 1477 int cpu; 1478 bool call_post; 1479 1480 cpu = PCPU_GET(cpuid); 1481 if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) { 1482 nmi_call_kdb(cpu, type, frame); 1483 call_post = false; 1484 } else { 1485 savectx(&stoppcbs[cpu]); 1486 CPU_SET_ATOMIC(cpu, &stopped_cpus); 1487 while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) 1488 ia32_pause(); 1489 call_post = true; 1490 } 1491 atomic_store_rel_int(&nmi_kdb_lock, 0); 1492 if (call_post) 1493 cpustop_handler_post(cpu); 1494 } 1495 1496 /* 1497 * Handle an IPI_STOP by saving our current context and spinning (or mwaiting, 1498 * if available) until we are resumed. 1499 */ 1500 void 1501 cpustop_handler(void) 1502 { 1503 struct monitorbuf *mb; 1504 u_int cpu; 1505 bool use_mwait; 1506 1507 cpu = PCPU_GET(cpuid); 1508 1509 savectx(&stoppcbs[cpu]); 1510 1511 use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 && 1512 !mwait_cpustop_broken); 1513 if (use_mwait) { 1514 mb = PCPU_PTR(monitorbuf); 1515 atomic_store_int(&mb->stop_state, 1516 MONITOR_STOPSTATE_STOPPED); 1517 } 1518 1519 /* Indicate that we are stopped */ 1520 CPU_SET_ATOMIC(cpu, &stopped_cpus); 1521 1522 /* Wait for restart */ 1523 while (!CPU_ISSET(cpu, &started_cpus)) { 1524 if (use_mwait) { 1525 cpu_monitor(mb, 0, 0); 1526 if (atomic_load_int(&mb->stop_state) == 1527 MONITOR_STOPSTATE_STOPPED) 1528 cpu_mwait(0, MWAIT_C1); 1529 continue; 1530 } 1531 1532 ia32_pause(); 1533 1534 /* 1535 * Halt non-BSP CPUs on panic -- we're never going to need them 1536 * again, and might as well save power / release resources 1537 * (e.g., overprovisioned VM infrastructure). 1538 */ 1539 while (__predict_false(!IS_BSP() && KERNEL_PANICKED())) 1540 halt(); 1541 } 1542 1543 cpustop_handler_post(cpu); 1544 } 1545 1546 static void 1547 cpustop_handler_post(u_int cpu) 1548 { 1549 1550 CPU_CLR_ATOMIC(cpu, &started_cpus); 1551 CPU_CLR_ATOMIC(cpu, &stopped_cpus); 1552 1553 /* 1554 * We don't broadcast TLB invalidations to other CPUs when they are 1555 * stopped. Hence, we clear the TLB before resuming. 1556 */ 1557 invltlb_glob(); 1558 1559 #if defined(__amd64__) && (defined(DDB) || defined(GDB)) 1560 amd64_db_resume_dbreg(); 1561 #endif 1562 1563 if (cpu == 0 && cpustop_restartfunc != NULL) { 1564 cpustop_restartfunc(); 1565 cpustop_restartfunc = NULL; 1566 } 1567 } 1568 1569 /* 1570 * Handle an IPI_SUSPEND by saving our current context and spinning until we 1571 * are resumed. 1572 */ 1573 void 1574 cpususpend_handler(void) 1575 { 1576 u_int cpu; 1577 1578 mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); 1579 1580 cpu = PCPU_GET(cpuid); 1581 if (savectx(&susppcbs[cpu]->sp_pcb)) { 1582 #ifdef __amd64__ 1583 fpususpend(susppcbs[cpu]->sp_fpususpend); 1584 #else 1585 npxsuspend(susppcbs[cpu]->sp_fpususpend); 1586 #endif 1587 /* 1588 * suspended_cpus is cleared shortly after each AP is restarted 1589 * by a Startup IPI, so that the BSP can proceed to restarting 1590 * the next AP. 1591 * 1592 * resuming_cpus gets cleared when the AP completes 1593 * initialization after having been released by the BSP. 1594 * resuming_cpus is probably not the best name for the 1595 * variable, because it is actually a set of processors that 1596 * haven't resumed yet and haven't necessarily started resuming. 1597 * 1598 * Note that suspended_cpus is meaningful only for ACPI suspend 1599 * as it's not really used for Xen suspend since the APs are 1600 * automatically restored to the running state and the correct 1601 * context. For the same reason resumectx is never called in 1602 * that case. 1603 */ 1604 CPU_SET_ATOMIC(cpu, &suspended_cpus); 1605 CPU_SET_ATOMIC(cpu, &resuming_cpus); 1606 1607 /* 1608 * Invalidate the cache after setting the global status bits. 1609 * The last AP to set its bit may end up being an Owner of the 1610 * corresponding cache line in MOESI protocol. The AP may be 1611 * stopped before the cache line is written to the main memory. 1612 */ 1613 wbinvd(); 1614 } else { 1615 #ifdef __amd64__ 1616 fpuresume(susppcbs[cpu]->sp_fpususpend); 1617 #else 1618 npxresume(susppcbs[cpu]->sp_fpususpend); 1619 #endif 1620 pmap_init_pat(); 1621 initializecpu(); 1622 PCPU_SET(switchtime, 0); 1623 PCPU_SET(switchticks, ticks); 1624 1625 /* Indicate that we have restarted and restored the context. */ 1626 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 1627 } 1628 1629 /* Wait for resume directive */ 1630 while (!CPU_ISSET(cpu, &toresume_cpus)) 1631 ia32_pause(); 1632 1633 /* Re-apply microcode updates. */ 1634 ucode_reload(); 1635 1636 #ifdef __i386__ 1637 /* Finish removing the identity mapping of low memory for this AP. */ 1638 invltlb_glob(); 1639 #endif 1640 1641 if (cpu_ops.cpu_resume) 1642 cpu_ops.cpu_resume(); 1643 #ifdef __amd64__ 1644 if (vmm_resume_p) 1645 vmm_resume_p(); 1646 #endif 1647 1648 /* Resume MCA and local APIC */ 1649 lapic_xapic_mode(); 1650 mca_resume(); 1651 lapic_setup(0); 1652 1653 /* Indicate that we are resumed */ 1654 CPU_CLR_ATOMIC(cpu, &resuming_cpus); 1655 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 1656 CPU_CLR_ATOMIC(cpu, &toresume_cpus); 1657 } 1658 1659 /* 1660 * Handle an IPI_SWI by waking delayed SWI thread. 1661 */ 1662 void 1663 ipi_swi_handler(struct trapframe frame) 1664 { 1665 1666 intr_event_handle(clk_intr_event, &frame); 1667 } 1668 1669 /* 1670 * This is called once the rest of the system is up and running and we're 1671 * ready to let the AP's out of the pen. 1672 */ 1673 static void 1674 release_aps(void *dummy __unused) 1675 { 1676 1677 if (mp_ncpus == 1) 1678 return; 1679 atomic_store_rel_int(&aps_ready, 1); 1680 while (smp_started == 0) 1681 ia32_pause(); 1682 } 1683 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); 1684 1685 #ifdef COUNT_IPIS 1686 /* 1687 * Setup interrupt counters for IPI handlers. 1688 */ 1689 static void 1690 mp_ipi_intrcnt(void *dummy) 1691 { 1692 char buf[64]; 1693 int i; 1694 1695 CPU_FOREACH(i) { 1696 snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); 1697 intrcnt_add(buf, &ipi_invltlb_counts[i]); 1698 snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); 1699 intrcnt_add(buf, &ipi_invlrng_counts[i]); 1700 snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); 1701 intrcnt_add(buf, &ipi_invlpg_counts[i]); 1702 snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); 1703 intrcnt_add(buf, &ipi_invlcache_counts[i]); 1704 snprintf(buf, sizeof(buf), "cpu%d:preempt", i); 1705 intrcnt_add(buf, &ipi_preempt_counts[i]); 1706 snprintf(buf, sizeof(buf), "cpu%d:ast", i); 1707 intrcnt_add(buf, &ipi_ast_counts[i]); 1708 snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); 1709 intrcnt_add(buf, &ipi_rendezvous_counts[i]); 1710 snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); 1711 intrcnt_add(buf, &ipi_hardclock_counts[i]); 1712 } 1713 } 1714 SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); 1715 #endif 1716