1 /*- 2 * Copyright (c) 1996, by Steve Passe 3 * Copyright (c) 2003, by Peter Wemm 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. The name of the developer may NOT be used to endorse or promote products 12 * derived from this software without specific prior written permission. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include "opt_acpi.h" 31 #ifdef __i386__ 32 #include "opt_apic.h" 33 #endif 34 #include "opt_cpu.h" 35 #include "opt_ddb.h" 36 #include "opt_gdb.h" 37 #include "opt_kstack_pages.h" 38 #include "opt_pmap.h" 39 #include "opt_sched.h" 40 #include "opt_smp.h" 41 #include "opt_stack.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/asan.h> 46 #include <sys/bus.h> 47 #include <sys/cons.h> /* cngetc() */ 48 #include <sys/cpuset.h> 49 #include <sys/csan.h> 50 #ifdef GPROF 51 #include <sys/gmon.h> 52 #endif 53 #include <sys/interrupt.h> 54 #include <sys/kdb.h> 55 #include <sys/kernel.h> 56 #include <sys/ktr.h> 57 #include <sys/lock.h> 58 #include <sys/malloc.h> 59 #include <sys/memrange.h> 60 #include <sys/mutex.h> 61 #include <sys/pcpu.h> 62 #include <sys/proc.h> 63 #include <sys/sched.h> 64 #include <sys/smp.h> 65 #include <sys/sysctl.h> 66 67 #include <vm/vm.h> 68 #include <vm/vm_param.h> 69 #include <vm/pmap.h> 70 #include <vm/vm_kern.h> 71 #include <vm/vm_extern.h> 72 #include <vm/vm_map.h> 73 74 #include <x86/apicreg.h> 75 #include <machine/clock.h> 76 #include <machine/cpu.h> 77 #include <machine/cputypes.h> 78 #include <x86/mca.h> 79 #include <machine/md_var.h> 80 #include <machine/pcb.h> 81 #include <machine/psl.h> 82 #include <machine/smp.h> 83 #include <machine/specialreg.h> 84 #include <machine/stack.h> 85 #include <x86/ucode.h> 86 87 #ifdef DEV_ACPI 88 #include <contrib/dev/acpica/include/acpi.h> 89 #include <dev/acpica/acpivar.h> 90 #endif 91 92 static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items"); 93 94 /* lock region used by kernel profiling */ 95 int mcount_lock; 96 97 int mp_naps; /* # of Applications processors */ 98 int boot_cpu_id = -1; /* designated BSP */ 99 100 /* AP uses this during bootstrap. Do not staticize. */ 101 char *bootSTK; 102 int bootAP; 103 104 /* Free these after use */ 105 void *bootstacks[MAXCPU]; 106 void *dpcpu; 107 108 struct pcb stoppcbs[MAXCPU]; 109 struct susppcb **susppcbs; 110 111 #ifdef COUNT_IPIS 112 /* Interrupt counts. 
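 * These per-CPU counters are registered with the interrupt counter
 * framework by mp_ipi_intrcnt() (via intrcnt_add()) near the bottom of
 * this file, and are only compiled in when COUNT_IPIS is defined.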
*/ 113 static u_long *ipi_preempt_counts[MAXCPU]; 114 static u_long *ipi_ast_counts[MAXCPU]; 115 u_long *ipi_invltlb_counts[MAXCPU]; 116 u_long *ipi_invlrng_counts[MAXCPU]; 117 u_long *ipi_invlpg_counts[MAXCPU]; 118 u_long *ipi_invlcache_counts[MAXCPU]; 119 u_long *ipi_rendezvous_counts[MAXCPU]; 120 static u_long *ipi_hardclock_counts[MAXCPU]; 121 #endif 122 123 /* Default cpu_ops implementation. */ 124 struct cpu_ops cpu_ops; 125 126 /* 127 * Local data and functions. 128 */ 129 130 static volatile cpuset_t ipi_stop_nmi_pending; 131 132 volatile cpuset_t resuming_cpus; 133 volatile cpuset_t toresume_cpus; 134 135 /* used to hold the APs until we are ready to release them */ 136 struct mtx ap_boot_mtx; 137 138 /* Set to 1 once we're ready to let the APs out of the pen. */ 139 volatile int aps_ready = 0; 140 141 /* 142 * Store data from cpu_add() until later in the boot when we actually set up 143 * the APs. 144 */ 145 struct cpu_info *cpu_info; 146 int *apic_cpuids; 147 int cpu_apic_ids[MAXCPU]; 148 _Static_assert(MAXCPU <= MAX_APIC_ID, 149 "MAXCPU cannot be larger than MAX_APIC_ID"); 150 _Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID, 151 "xAPIC_MAX_APIC_ID cannot be larger than MAX_APIC_ID"); 152 153 static void release_aps(void *dummy); 154 static void cpustop_handler_post(u_int cpu); 155 156 static int hyperthreading_allowed = 1; 157 SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN, 158 &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs"); 159 160 static int hyperthreading_intr_allowed = 0; 161 SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN, 162 &hyperthreading_intr_allowed, 0, 163 "Allow interrupts on HTT logical CPUs"); 164 165 static struct topo_node topo_root; 166 167 static int pkg_id_shift; 168 static int node_id_shift; 169 static int core_id_shift; 170 static int disabled_cpus; 171 172 struct cache_info { 173 int id_shift; 174 int present; 175 } static caches[MAX_CACHE_LEVELS]; 176 177 static bool stop_mwait = false; 178 SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0, 179 "Use MONITOR/MWAIT when stopping CPU, if available"); 180 181 void 182 mem_range_AP_init(void) 183 { 184 185 if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) 186 mem_range_softc.mr_op->initAP(&mem_range_softc); 187 } 188 189 /* 190 * Round up to the next power of two, if necessary, and then 191 * take log2. 192 * Returns -1 if the argument is zero. 193 */ 194 static __inline int 195 mask_width(u_int x) 196 { 197 198 return (fls(x << (1 - powerof2(x))) - 1); 199 } 200 201 /* 202 * Add a cache level to the cache topology description.
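 * The share_count argument is the maximum number of logical CPUs sharing
 * the cache, as reported by CPUID; it is converted to an APIC ID shift
 * with mask_width() (e.g. a share_count of 12 rounds up to 16 and yields
 * an id_shift of 4).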
203 */ 204 static int 205 add_deterministic_cache(int type, int level, int share_count) 206 { 207 208 if (type == 0) 209 return (0); 210 if (type > 3) { 211 printf("unexpected cache type %d\n", type); 212 return (1); 213 } 214 if (type == 2) /* ignore instruction cache */ 215 return (1); 216 if (level == 0 || level > MAX_CACHE_LEVELS) { 217 printf("unexpected cache level %d\n", level); 218 return (1); 219 } 220 221 if (caches[level - 1].present) { 222 printf("WARNING: multiple entries for L%u data cache\n", level); 223 printf("%u => %u\n", caches[level - 1].id_shift, 224 mask_width(share_count)); 225 } 226 caches[level - 1].id_shift = mask_width(share_count); 227 caches[level - 1].present = 1; 228 229 if (caches[level - 1].id_shift > pkg_id_shift) { 230 printf("WARNING: L%u data cache covers more " 231 "APIC IDs than a package (%u > %u)\n", level, 232 caches[level - 1].id_shift, pkg_id_shift); 233 caches[level - 1].id_shift = pkg_id_shift; 234 } 235 if (caches[level - 1].id_shift < core_id_shift) { 236 printf("WARNING: L%u data cache covers fewer " 237 "APIC IDs than a core (%u < %u)\n", level, 238 caches[level - 1].id_shift, core_id_shift); 239 caches[level - 1].id_shift = core_id_shift; 240 } 241 242 return (1); 243 } 244 245 /* 246 * Determine topology of processing units and caches for AMD CPUs. 247 * See: 248 * - AMD CPUID Specification (Publication # 25481) 249 * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559) 250 * - BKDG For AMD Family 10h Processors (Publication # 31116) 251 * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301) 252 * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751) 253 * - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945) 254 */ 255 static void 256 topo_probe_amd(void) 257 { 258 u_int p[4]; 259 uint64_t v; 260 int level; 261 int nodes_per_socket; 262 int share_count; 263 int type; 264 int i; 265 266 /* No multi-core capability. */ 267 if ((amd_feature2 & AMDID2_CMP) == 0) 268 return; 269 270 /* For families 10h and newer. */ 271 pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >> 272 AMDID_COREID_SIZE_SHIFT; 273 274 /* For 0Fh family. */ 275 if (pkg_id_shift == 0) 276 pkg_id_shift = 277 mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1); 278 279 /* 280 * Families prior to 16h define the following value as 281 * cores per compute unit and we don't really care about the AMD 282 * compute units at the moment. Perhaps we should treat them as 283 * cores and cores within the compute units as hardware threads, 284 * but that's up for debate. 285 * Later families define the value as threads per compute unit, 286 * so we are following AMD's nomenclature here. 287 */ 288 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 && 289 CPUID_TO_FAMILY(cpu_id) >= 0x16) { 290 cpuid_count(0x8000001e, 0, p); 291 share_count = ((p[1] >> 8) & 0xff) + 1; 292 core_id_shift = mask_width(share_count); 293 294 /* 295 * For Zen (17h), gather Nodes per Processor. Each node is a 296 * Zeppelin die; TR and EPYC CPUs will have multiple dies per 297 * package. Communication latency between dies is higher than 298 * within them. 
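 * The node count below comes from CPUID Fn8000_001E ECX[10:8]
 * (NodesPerProcessor, encoded as the count minus one).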
299 */ 300 nodes_per_socket = ((p[2] >> 8) & 0x7) + 1; 301 node_id_shift = pkg_id_shift - mask_width(nodes_per_socket); 302 } 303 304 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) { 305 for (i = 0; ; i++) { 306 cpuid_count(0x8000001d, i, p); 307 type = p[0] & 0x1f; 308 level = (p[0] >> 5) & 0x7; 309 share_count = 1 + ((p[0] >> 14) & 0xfff); 310 311 if (!add_deterministic_cache(type, level, share_count)) 312 break; 313 } 314 } else { 315 if (cpu_exthigh >= 0x80000005) { 316 cpuid_count(0x80000005, 0, p); 317 if (((p[2] >> 24) & 0xff) != 0) { 318 caches[0].id_shift = 0; 319 caches[0].present = 1; 320 } 321 } 322 if (cpu_exthigh >= 0x80000006) { 323 cpuid_count(0x80000006, 0, p); 324 if (((p[2] >> 16) & 0xffff) != 0) { 325 caches[1].id_shift = 0; 326 caches[1].present = 1; 327 } 328 if (((p[3] >> 18) & 0x3fff) != 0) { 329 nodes_per_socket = 1; 330 if ((amd_feature2 & AMDID2_NODE_ID) != 0) { 331 /* 332 * Handle multi-node processors that 333 * have multiple chips, each with its 334 * own L3 cache, on the same die. 335 */ 336 v = rdmsr(0xc001100c); 337 nodes_per_socket = 1 + ((v >> 3) & 0x7); 338 } 339 caches[2].id_shift = 340 pkg_id_shift - mask_width(nodes_per_socket); 341 caches[2].present = 1; 342 } 343 } 344 } 345 } 346 347 /* 348 * Determine topology of processing units for Intel CPUs 349 * using CPUID Leaf 1 and Leaf 4, if supported. 350 * See: 351 * - Intel 64 Architecture Processor Topology Enumeration 352 * - Intel 64 and IA-32 Architectures Software Developer’s Manual, 353 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS 354 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS 355 */ 356 static void 357 topo_probe_intel_0x4(void) 358 { 359 u_int p[4]; 360 int max_cores; 361 int max_logical; 362 363 /* Both zero and one here mean one logical processor per package. */ 364 max_logical = (cpu_feature & CPUID_HTT) != 0 ? 365 (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; 366 if (max_logical <= 1) 367 return; 368 369 if (cpu_high >= 0x4) { 370 cpuid_count(0x04, 0, p); 371 max_cores = ((p[0] >> 26) & 0x3f) + 1; 372 } else 373 max_cores = 1; 374 375 core_id_shift = mask_width(max_logical/max_cores); 376 KASSERT(core_id_shift >= 0, 377 ("intel topo: max_cores > max_logical\n")); 378 pkg_id_shift = core_id_shift + mask_width(max_cores); 379 } 380 381 /* 382 * Determine topology of processing units for Intel CPUs 383 * using CPUID Leaf 1Fh or 0Bh, if supported. 384 * See: 385 * - Intel 64 Architecture Processor Topology Enumeration 386 * - Intel 64 and IA-32 Architectures Software Developer’s Manual, 387 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS 388 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS 389 */ 390 static void 391 topo_probe_intel_0xb(void) 392 { 393 u_int leaf; 394 u_int p[4] = { 0 }; 395 int bits; 396 int type; 397 int i; 398 399 /* Prefer leaf 1Fh (V2 Extended Topology Enumeration). */ 400 if (cpu_high >= 0x1f) { 401 leaf = 0x1f; 402 cpuid_count(leaf, 0, p); 403 } 404 /* Fall back to leaf 0Bh (Extended Topology Enumeration). */ 405 if (p[1] == 0) { 406 leaf = 0x0b; 407 cpuid_count(leaf, 0, p); 408 } 409 /* Fall back to leaf 04h (Deterministic Cache Parameters). */ 410 if (p[1] == 0) { 411 topo_probe_intel_0x4(); 412 return; 413 } 414 415 /* We only support three levels for now.
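 * (SMT, core and package); any other level type reported by leaf 1Fh/0Bh,
 * such as module or die, is only printed in verbose boots.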
*/ 416 for (i = 0; ; i++) { 417 cpuid_count(leaf, i, p); 418 419 bits = p[0] & 0x1f; 420 type = (p[2] >> 8) & 0xff; 421 422 if (type == 0) 423 break; 424 425 if (type == CPUID_TYPE_SMT) 426 core_id_shift = bits; 427 else if (type == CPUID_TYPE_CORE) 428 pkg_id_shift = bits; 429 else if (bootverbose) 430 printf("Topology level type %d shift: %d\n", type, bits); 431 } 432 433 if (pkg_id_shift < core_id_shift) { 434 printf("WARNING: core covers more APIC IDs than a package\n"); 435 core_id_shift = pkg_id_shift; 436 } 437 } 438 439 /* 440 * Determine topology of caches for Intel CPUs. 441 * See: 442 * - Intel 64 Architecture Processor Topology Enumeration 443 * - Intel 64 and IA-32 Architectures Software Developer’s Manual 444 * Volume 2A: Instruction Set Reference, A-M, 445 * CPUID instruction 446 */ 447 static void 448 topo_probe_intel_caches(void) 449 { 450 u_int p[4]; 451 int level; 452 int share_count; 453 int type; 454 int i; 455 456 if (cpu_high < 0x4) { 457 /* 458 * Available cache levels and sizes can be determined 459 * via CPUID leaf 2, but that requires a huge table of hardcoded 460 * values, so for now just assume that L1 and L2 caches are 461 * potentially shared only by HTT processing units, if HTT is present. 462 */ 463 caches[0].id_shift = pkg_id_shift; 464 caches[0].present = 1; 465 caches[1].id_shift = pkg_id_shift; 466 caches[1].present = 1; 467 return; 468 } 469 470 for (i = 0; ; i++) { 471 cpuid_count(0x4, i, p); 472 type = p[0] & 0x1f; 473 level = (p[0] >> 5) & 0x7; 474 share_count = 1 + ((p[0] >> 14) & 0xfff); 475 476 if (!add_deterministic_cache(type, level, share_count)) 477 break; 478 } 479 } 480 481 /* 482 * Determine topology of processing units and caches for Intel CPUs. 483 * See: 484 * - Intel 64 Architecture Processor Topology Enumeration 485 */ 486 static void 487 topo_probe_intel(void) 488 { 489 490 /* 491 * Note that the 0x1 <= cpu_high < 4 case should be 492 * compatible with topo_probe_intel_0x4() logic when 493 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) 494 * or it should trigger the fallback otherwise. 495 */ 496 if (cpu_high >= 0xb) 497 topo_probe_intel_0xb(); 498 else if (cpu_high >= 0x1) 499 topo_probe_intel_0x4(); 500 501 topo_probe_intel_caches(); 502 } 503 504 /* 505 * Topology information is queried only on BSP, on which this 506 * code runs and for which it can query CPUID information. 507 * Then topology is extrapolated on all packages using an 508 * assumption that APIC ID to hardware component ID mapping is 509 * homogeneous. 510 * That doesn't necessarily imply that the topology is uniform.
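 * For example, individual packages may have different numbers of cores
 * present or enabled even though they share the same APIC ID layout.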
511 */ 512 void 513 topo_probe(void) 514 { 515 static int cpu_topo_probed = 0; 516 struct x86_topo_layer { 517 int type; 518 int subtype; 519 int id_shift; 520 } topo_layers[MAX_CACHE_LEVELS + 5]; 521 struct topo_node *parent; 522 struct topo_node *node; 523 int layer; 524 int nlayers; 525 int node_id; 526 int i; 527 #if defined(DEV_ACPI) && MAXMEMDOM > 1 528 int d, domain; 529 #endif 530 531 if (cpu_topo_probed) 532 return; 533 534 CPU_ZERO(&logical_cpus_mask); 535 536 if (mp_ncpus <= 1) 537 ; /* nothing */ 538 else if (cpu_vendor_id == CPU_VENDOR_AMD || 539 cpu_vendor_id == CPU_VENDOR_HYGON) 540 topo_probe_amd(); 541 else if (cpu_vendor_id == CPU_VENDOR_INTEL) 542 topo_probe_intel(); 543 544 KASSERT(pkg_id_shift >= core_id_shift, 545 ("bug in APIC topology discovery")); 546 547 nlayers = 0; 548 bzero(topo_layers, sizeof(topo_layers)); 549 550 topo_layers[nlayers].type = TOPO_TYPE_PKG; 551 topo_layers[nlayers].id_shift = pkg_id_shift; 552 if (bootverbose) 553 printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift); 554 nlayers++; 555 556 if (pkg_id_shift > node_id_shift && node_id_shift != 0) { 557 topo_layers[nlayers].type = TOPO_TYPE_GROUP; 558 topo_layers[nlayers].id_shift = node_id_shift; 559 if (bootverbose) 560 printf("Node ID shift: %u\n", 561 topo_layers[nlayers].id_shift); 562 nlayers++; 563 } 564 565 /* 566 * Consider all caches to be within a package/chip 567 * and "in front" of all sub-components like 568 * cores and hardware threads. 569 */ 570 for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) { 571 if (caches[i].present) { 572 if (node_id_shift != 0) 573 KASSERT(caches[i].id_shift <= node_id_shift, 574 ("bug in APIC topology discovery")); 575 KASSERT(caches[i].id_shift <= pkg_id_shift, 576 ("bug in APIC topology discovery")); 577 KASSERT(caches[i].id_shift >= core_id_shift, 578 ("bug in APIC topology discovery")); 579 580 topo_layers[nlayers].type = TOPO_TYPE_CACHE; 581 topo_layers[nlayers].subtype = i + 1; 582 topo_layers[nlayers].id_shift = caches[i].id_shift; 583 if (bootverbose) 584 printf("L%u cache ID shift: %u\n", 585 topo_layers[nlayers].subtype, 586 topo_layers[nlayers].id_shift); 587 nlayers++; 588 } 589 } 590 591 if (pkg_id_shift > core_id_shift) { 592 topo_layers[nlayers].type = TOPO_TYPE_CORE; 593 topo_layers[nlayers].id_shift = core_id_shift; 594 if (bootverbose) 595 printf("Core ID shift: %u\n", 596 topo_layers[nlayers].id_shift); 597 nlayers++; 598 } 599 600 topo_layers[nlayers].type = TOPO_TYPE_PU; 601 topo_layers[nlayers].id_shift = 0; 602 nlayers++; 603 604 #if defined(DEV_ACPI) && MAXMEMDOM > 1 605 if (vm_ndomains > 1) { 606 for (layer = 0; layer < nlayers; ++layer) { 607 for (i = 0; i <= max_apic_id; ++i) { 608 if ((i & ((1 << topo_layers[layer].id_shift) - 1)) == 0) 609 domain = -1; 610 if (!cpu_info[i].cpu_present) 611 continue; 612 d = acpi_pxm_get_cpu_locality(i); 613 if (domain >= 0 && domain != d) 614 break; 615 domain = d; 616 } 617 if (i > max_apic_id) 618 break; 619 } 620 KASSERT(layer < nlayers, ("NUMA domain smaller than PU")); 621 memmove(&topo_layers[layer+1], &topo_layers[layer], 622 sizeof(*topo_layers) * (nlayers - layer)); 623 topo_layers[layer].type = TOPO_TYPE_NODE; 624 topo_layers[layer].subtype = CG_SHARE_NONE; 625 nlayers++; 626 } 627 #endif 628 629 topo_init_root(&topo_root); 630 for (i = 0; i <= max_apic_id; ++i) { 631 if (!cpu_info[i].cpu_present) 632 continue; 633 634 parent = &topo_root; 635 for (layer = 0; layer < nlayers; ++layer) { 636 #if defined(DEV_ACPI) && MAXMEMDOM > 1 637 if (topo_layers[layer].type == 
TOPO_TYPE_NODE) { 638 node_id = acpi_pxm_get_cpu_locality(i); 639 } else 640 #endif 641 node_id = i >> topo_layers[layer].id_shift; 642 parent = topo_add_node_by_hwid(parent, node_id, 643 topo_layers[layer].type, 644 topo_layers[layer].subtype); 645 } 646 } 647 648 parent = &topo_root; 649 for (layer = 0; layer < nlayers; ++layer) { 650 #if defined(DEV_ACPI) && MAXMEMDOM > 1 651 if (topo_layers[layer].type == TOPO_TYPE_NODE) 652 node_id = acpi_pxm_get_cpu_locality(boot_cpu_id); 653 else 654 #endif 655 node_id = boot_cpu_id >> topo_layers[layer].id_shift; 656 node = topo_find_node_by_hwid(parent, node_id, 657 topo_layers[layer].type, 658 topo_layers[layer].subtype); 659 topo_promote_child(node); 660 parent = node; 661 } 662 663 cpu_topo_probed = 1; 664 } 665 666 /* 667 * Assign logical CPU IDs to local APICs. 668 */ 669 void 670 assign_cpu_ids(void) 671 { 672 struct topo_node *node; 673 u_int smt_mask; 674 int nhyper; 675 676 smt_mask = (1u << core_id_shift) - 1; 677 678 /* 679 * Assign CPU IDs to local APIC IDs and disable any CPUs 680 * beyond MAXCPU. CPU 0 is always assigned to the BSP. 681 */ 682 mp_ncpus = 0; 683 nhyper = 0; 684 TOPO_FOREACH(node, &topo_root) { 685 if (node->type != TOPO_TYPE_PU) 686 continue; 687 688 if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask)) 689 cpu_info[node->hwid].cpu_hyperthread = 1; 690 691 if (resource_disabled("lapic", node->hwid)) { 692 if (node->hwid != boot_cpu_id) 693 cpu_info[node->hwid].cpu_disabled = 1; 694 else 695 printf("Cannot disable BSP, APIC ID = %d\n", 696 node->hwid); 697 } 698 699 if (!hyperthreading_allowed && 700 cpu_info[node->hwid].cpu_hyperthread) 701 cpu_info[node->hwid].cpu_disabled = 1; 702 703 if (mp_ncpus >= MAXCPU) 704 cpu_info[node->hwid].cpu_disabled = 1; 705 706 if (cpu_info[node->hwid].cpu_disabled) { 707 disabled_cpus++; 708 continue; 709 } 710 711 if (cpu_info[node->hwid].cpu_hyperthread) 712 nhyper++; 713 714 cpu_apic_ids[mp_ncpus] = node->hwid; 715 apic_cpuids[node->hwid] = mp_ncpus; 716 topo_set_pu_id(node, mp_ncpus); 717 mp_ncpus++; 718 } 719 720 KASSERT(mp_maxid >= mp_ncpus - 1, 721 ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, 722 mp_ncpus)); 723 724 mp_ncores = mp_ncpus - nhyper; 725 smp_threads_per_core = mp_ncpus / mp_ncores; 726 } 727 728 /* 729 * Print various information about the SMP system hardware and setup. 
730 */ 731 void 732 cpu_mp_announce(void) 733 { 734 struct topo_node *node; 735 const char *hyperthread; 736 struct topo_analysis topology; 737 738 printf("FreeBSD/SMP: "); 739 if (topo_analyze(&topo_root, 1, &topology)) { 740 printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]); 741 if (topology.entities[TOPO_LEVEL_GROUP] > 1) 742 printf(" x %d groups", 743 topology.entities[TOPO_LEVEL_GROUP]); 744 if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) 745 printf(" x %d cache groups", 746 topology.entities[TOPO_LEVEL_CACHEGROUP]); 747 if (topology.entities[TOPO_LEVEL_CORE] > 0) 748 printf(" x %d core(s)", 749 topology.entities[TOPO_LEVEL_CORE]); 750 if (topology.entities[TOPO_LEVEL_THREAD] > 1) 751 printf(" x %d hardware threads", 752 topology.entities[TOPO_LEVEL_THREAD]); 753 } else { 754 printf("Non-uniform topology"); 755 } 756 printf("\n"); 757 758 if (disabled_cpus) { 759 printf("FreeBSD/SMP Online: "); 760 if (topo_analyze(&topo_root, 0, &topology)) { 761 printf("%d package(s)", 762 topology.entities[TOPO_LEVEL_PKG]); 763 if (topology.entities[TOPO_LEVEL_GROUP] > 1) 764 printf(" x %d groups", 765 topology.entities[TOPO_LEVEL_GROUP]); 766 if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) 767 printf(" x %d cache groups", 768 topology.entities[TOPO_LEVEL_CACHEGROUP]); 769 if (topology.entities[TOPO_LEVEL_CORE] > 0) 770 printf(" x %d core(s)", 771 topology.entities[TOPO_LEVEL_CORE]); 772 if (topology.entities[TOPO_LEVEL_THREAD] > 1) 773 printf(" x %d hardware threads", 774 topology.entities[TOPO_LEVEL_THREAD]); 775 } else { 776 printf("Non-uniform topology"); 777 } 778 printf("\n"); 779 } 780 781 if (!bootverbose) 782 return; 783 784 TOPO_FOREACH(node, &topo_root) { 785 switch (node->type) { 786 case TOPO_TYPE_PKG: 787 printf("Package HW ID = %u\n", node->hwid); 788 break; 789 case TOPO_TYPE_CORE: 790 printf("\tCore HW ID = %u\n", node->hwid); 791 break; 792 case TOPO_TYPE_PU: 793 if (cpu_info[node->hwid].cpu_hyperthread) 794 hyperthread = "/HT"; 795 else 796 hyperthread = ""; 797 798 if (node->subtype == 0) 799 printf("\t\tCPU (AP%s): APIC ID: %u" 800 "(disabled)\n", hyperthread, node->hwid); 801 else if (node->id == 0) 802 printf("\t\tCPU0 (BSP): APIC ID: %u\n", 803 node->hwid); 804 else 805 printf("\t\tCPU%u (AP%s): APIC ID: %u\n", 806 node->id, hyperthread, node->hwid); 807 break; 808 default: 809 /* ignored */ 810 break; 811 } 812 } 813 } 814 815 /* 816 * Add a scheduling group, a group of logical processors sharing 817 * a particular cache (and, thus having an affinity), to the scheduling 818 * topology. 819 * This function recursively works on lower level caches. 820 */ 821 static void 822 x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root) 823 { 824 struct topo_node *node; 825 int nchildren; 826 int ncores; 827 int i; 828 829 KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE || 830 root->type == TOPO_TYPE_NODE || root->type == TOPO_TYPE_GROUP, 831 ("x86topo_add_sched_group: bad type: %u", root->type)); 832 CPU_COPY(&root->cpuset, &cg_root->cg_mask); 833 cg_root->cg_count = root->cpu_count; 834 if (root->type == TOPO_TYPE_CACHE) 835 cg_root->cg_level = root->subtype; 836 else 837 cg_root->cg_level = CG_SHARE_NONE; 838 if (root->type == TOPO_TYPE_NODE) 839 cg_root->cg_flags = CG_FLAG_NODE; 840 else 841 cg_root->cg_flags = 0; 842 843 /* 844 * Check how many core nodes we have under the given root node. 845 * If we have multiple logical processors, but not multiple 846 * cores, then those processors must be hardware threads. 
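 * In that case the group is marked with CG_FLAG_SMT below.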
847 */ 848 ncores = 0; 849 node = root; 850 while (node != NULL) { 851 if (node->type != TOPO_TYPE_CORE) { 852 node = topo_next_node(root, node); 853 continue; 854 } 855 856 ncores++; 857 node = topo_next_nonchild_node(root, node); 858 } 859 860 if (cg_root->cg_level != CG_SHARE_NONE && 861 root->cpu_count > 1 && ncores < 2) 862 cg_root->cg_flags |= CG_FLAG_SMT; 863 864 /* 865 * Find out how many cache nodes we have under the given root node. 866 * We ignore cache nodes that cover all the same processors as the 867 * root node. Also, we do not descend below found cache nodes. 868 * That is, we count top-level "non-redundant" caches under the root 869 * node. 870 */ 871 nchildren = 0; 872 node = root; 873 while (node != NULL) { 874 /* 875 * When some APICs are disabled by tunables, nodes can end up 876 * with an empty cpuset. Nodes with an empty cpuset will be 877 * translated into cpu groups with empty cpusets. smp_topo_fill 878 * will then set cg_first and cg_last to -1. This isn't 879 * correctly handled in all functions. E.g., when 880 * cpu_search_lowest and cpu_search_highest loop through all 881 * cpus, they call CPU_ISSET on cpu -1 which results in a 882 * general protection fault. 883 * 884 * We could fix the scheduler to handle empty cpu groups 885 * correctly. Nevertheless, empty cpu groups cause 886 * overhead for no value, so it makes more sense simply not to 887 * create them. 888 */ 889 if (CPU_EMPTY(&node->cpuset)) { 890 node = topo_next_node(root, node); 891 continue; 892 } 893 if (CPU_CMP(&node->cpuset, &root->cpuset) == 0) { 894 if (node->type == TOPO_TYPE_CACHE && 895 cg_root->cg_level < node->subtype) 896 cg_root->cg_level = node->subtype; 897 if (node->type == TOPO_TYPE_NODE) 898 cg_root->cg_flags |= CG_FLAG_NODE; 899 node = topo_next_node(root, node); 900 continue; 901 } 902 if (node->type != TOPO_TYPE_GROUP && 903 node->type != TOPO_TYPE_NODE && 904 node->type != TOPO_TYPE_CACHE) { 905 node = topo_next_node(root, node); 906 continue; 907 } 908 nchildren++; 909 node = topo_next_nonchild_node(root, node); 910 } 911 912 /* 913 * We are not interested in nodes that include only one CPU each. 914 */ 915 if (nchildren == root->cpu_count) 916 return; 917 918 /* 919 * We are not interested in nodes without children. 920 */ 921 cg_root->cg_children = nchildren; 922 if (nchildren == 0) 923 return; 924 925 cg_root->cg_child = smp_topo_alloc(nchildren); 926 927 /* 928 * Now find again the same cache nodes as above and recursively 929 * build scheduling topologies for them. 930 */ 931 node = root; 932 i = 0; 933 while (node != NULL) { 934 if ((node->type != TOPO_TYPE_GROUP && 935 node->type != TOPO_TYPE_NODE && 936 node->type != TOPO_TYPE_CACHE) || 937 CPU_CMP(&node->cpuset, &root->cpuset) == 0 || 938 CPU_EMPTY(&node->cpuset)) { 939 node = topo_next_node(root, node); 940 continue; 941 } 942 cg_root->cg_child[i].cg_parent = cg_root; 943 x86topo_add_sched_group(node, &cg_root->cg_child[i]); 944 i++; 945 node = topo_next_nonchild_node(root, node); 946 } 947 } 948 949 /* 950 * Build the MI scheduling topology from the discovered hardware topology. 951 */ 952 struct cpu_group * 953 cpu_topo(void) 954 { 955 struct cpu_group *cg_root; 956 957 if (mp_ncpus <= 1) 958 return (smp_topo_none()); 959 960 cg_root = smp_topo_alloc(1); 961 x86topo_add_sched_group(&topo_root, cg_root); 962 return (cg_root); 963 } 964 965 static void 966 cpu_alloc(void *dummy __unused) 967 { 968 /* 969 * Dynamically allocate the arrays that depend on the 970 * maximum APIC ID.
971 */ 972 cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS, 973 M_WAITOK | M_ZERO); 974 apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS, 975 M_WAITOK | M_ZERO); 976 } 977 SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL); 978 979 /* 980 * Add a logical CPU to the topology. 981 */ 982 void 983 cpu_add(u_int apic_id, char boot_cpu) 984 { 985 986 if (apic_id > max_apic_id) { 987 panic("SMP: APIC ID %d too high", apic_id); 988 return; 989 } 990 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice", 991 apic_id)); 992 cpu_info[apic_id].cpu_present = 1; 993 if (boot_cpu) { 994 KASSERT(boot_cpu_id == -1, 995 ("CPU %u claims to be BSP, but CPU %u already is", apic_id, 996 boot_cpu_id)); 997 boot_cpu_id = apic_id; 998 cpu_info[apic_id].cpu_bsp = 1; 999 } 1000 if (bootverbose) 1001 printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" : 1002 "AP"); 1003 } 1004 1005 void 1006 cpu_mp_setmaxid(void) 1007 { 1008 1009 /* 1010 * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). 1011 * If there were no calls to cpu_add() assume this is a UP system. 1012 */ 1013 if (mp_ncpus == 0) 1014 mp_ncpus = 1; 1015 } 1016 1017 int 1018 cpu_mp_probe(void) 1019 { 1020 1021 /* 1022 * Always record BSP in CPU map so that the mbuf init code works 1023 * correctly. 1024 */ 1025 CPU_SETOF(0, &all_cpus); 1026 return (mp_ncpus > 1); 1027 } 1028 1029 /* 1030 * AP CPU's call this to initialize themselves. 1031 */ 1032 void 1033 init_secondary_tail(void) 1034 { 1035 u_int cpuid; 1036 1037 pmap_activate_boot(vmspace_pmap(proc0.p_vmspace)); 1038 1039 /* 1040 * On real hardware, switch to x2apic mode if possible. Do it 1041 * after aps_ready was signalled, to avoid manipulating the 1042 * mode while BSP might still want to send some IPI to us 1043 * (second startup IPI is ignored on modern hardware etc). 1044 */ 1045 lapic_xapic_mode(); 1046 1047 /* Initialize the PAT MSR. */ 1048 pmap_init_pat(); 1049 1050 /* set up CPU registers and state */ 1051 cpu_setregs(); 1052 1053 /* set up SSE/NX */ 1054 initializecpu(); 1055 1056 /* set up FPU state on the AP */ 1057 #ifdef __amd64__ 1058 fpuinit(); 1059 #else 1060 npxinit(false); 1061 #endif 1062 1063 if (cpu_ops.cpu_init) 1064 cpu_ops.cpu_init(); 1065 1066 /* A quick check from sanity claus */ 1067 cpuid = PCPU_GET(cpuid); 1068 if (PCPU_GET(apic_id) != lapic_id()) { 1069 printf("SMP: cpuid = %d\n", cpuid); 1070 printf("SMP: actual apic_id = %d\n", lapic_id()); 1071 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); 1072 panic("cpuid mismatch! boom!!"); 1073 } 1074 1075 /* Initialize curthread. */ 1076 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); 1077 PCPU_SET(curthread, PCPU_GET(idlethread)); 1078 schedinit_ap(); 1079 1080 mtx_lock_spin(&ap_boot_mtx); 1081 1082 mca_init(); 1083 1084 /* Init local apic for irq's */ 1085 lapic_setup(1); 1086 1087 /* Set memory range attributes for this CPU to match the BSP */ 1088 mem_range_AP_init(); 1089 1090 smp_cpus++; 1091 1092 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); 1093 if (bootverbose) 1094 printf("SMP: AP CPU #%d Launched!\n", cpuid); 1095 else 1096 printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "", 1097 cpuid, smp_cpus == mp_ncpus ? "\n" : " "); 1098 1099 /* Determine if we are a logical CPU. 
*/ 1100 if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread) 1101 CPU_SET(cpuid, &logical_cpus_mask); 1102 1103 if (bootverbose) 1104 lapic_dump("AP"); 1105 1106 if (smp_cpus == mp_ncpus) { 1107 /* enable IPI's, tlb shootdown, freezes etc */ 1108 atomic_store_rel_int(&smp_started, 1); 1109 } 1110 1111 #ifdef __amd64__ 1112 if (pmap_pcid_enabled) 1113 load_cr4(rcr4() | CR4_PCIDE); 1114 load_ds(_udatasel); 1115 load_es(_udatasel); 1116 load_fs(_ufssel); 1117 #endif 1118 1119 mtx_unlock_spin(&ap_boot_mtx); 1120 1121 /* Wait until all the AP's are up. */ 1122 while (atomic_load_acq_int(&smp_started) == 0) 1123 ia32_pause(); 1124 1125 #ifndef EARLY_AP_STARTUP 1126 /* Start per-CPU event timers. */ 1127 cpu_initclocks_ap(); 1128 #endif 1129 1130 kcsan_cpu_init(cpuid); 1131 1132 sched_ap_entry(); 1133 1134 panic("scheduler returned us to %s", __func__); 1135 /* NOTREACHED */ 1136 } 1137 1138 static void 1139 smp_after_idle_runnable(void *arg __unused) 1140 { 1141 int cpu; 1142 1143 if (mp_ncpus == 1) 1144 return; 1145 1146 KASSERT(smp_started != 0, ("%s: SMP not started yet", __func__)); 1147 1148 /* 1149 * Wait for all APs to handle an interrupt. After that, we know that 1150 * the APs have entered the scheduler at least once, so the boot stacks 1151 * are safe to free. 1152 */ 1153 smp_rendezvous(smp_no_rendezvous_barrier, NULL, 1154 smp_no_rendezvous_barrier, NULL); 1155 1156 for (cpu = 1; cpu < mp_ncpus; cpu++) { 1157 kmem_free(bootstacks[cpu], kstack_pages * PAGE_SIZE); 1158 } 1159 } 1160 SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY, 1161 smp_after_idle_runnable, NULL); 1162 1163 /* 1164 * We tell the I/O APIC code about all the CPUs we want to receive 1165 * interrupts. If we don't want certain CPUs to receive IRQs we 1166 * can simply not tell the I/O APIC code about them in this function. 1167 * We also do not tell it about the BSP since it tells itself about 1168 * the BSP internally to work with UP kernels and on UP machines. 1169 */ 1170 void 1171 set_interrupt_apic_ids(void) 1172 { 1173 u_int i, apic_id; 1174 1175 for (i = 0; i < MAXCPU; i++) { 1176 apic_id = cpu_apic_ids[i]; 1177 if (apic_id == -1) 1178 continue; 1179 if (cpu_info[apic_id].cpu_bsp) 1180 continue; 1181 if (cpu_info[apic_id].cpu_disabled) 1182 continue; 1183 1184 /* Don't let hyperthreads service interrupts. */ 1185 if (cpu_info[apic_id].cpu_hyperthread && 1186 !hyperthreading_intr_allowed) 1187 continue; 1188 1189 intr_add_cpu(i); 1190 } 1191 } 1192 1193 #ifdef COUNT_XINVLTLB_HITS 1194 u_int xhits_gbl[MAXCPU]; 1195 u_int xhits_pg[MAXCPU]; 1196 u_int xhits_rng[MAXCPU]; 1197 static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1198 ""); 1199 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, 1200 sizeof(xhits_gbl), "IU", ""); 1201 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, 1202 sizeof(xhits_pg), "IU", ""); 1203 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, 1204 sizeof(xhits_rng), "IU", ""); 1205 1206 u_int ipi_global; 1207 u_int ipi_page; 1208 u_int ipi_range; 1209 u_int ipi_range_size; 1210 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); 1211 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); 1212 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); 1213 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 1214 0, ""); 1215 #endif /* COUNT_XINVLTLB_HITS */ 1216 1217 /* 1218 * Init and startup IPI. 
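 * This is the MP specification INIT-SIPI-SIPI handshake: one INIT IPI,
 * a delay, then two STARTUP IPIs whose vector field encodes the 4 KB
 * page at which the AP begins executing its boot trampoline.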
1219 */ 1220 void 1221 ipi_startup(int apic_id, int vector) 1222 { 1223 1224 /* 1225 * This attempts to follow the algorithm described in the 1226 * Intel Multiprocessor Specification v1.4 in section B.4. 1227 * For each IPI, we allow the local APIC ~20us to deliver the 1228 * IPI. If that times out, we panic. 1229 */ 1230 1231 /* 1232 * first we do an INIT IPI: this INIT IPI might be run, resetting 1233 * and running the target CPU. OR this INIT IPI might be latched (P5 1234 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be 1235 * ignored. 1236 */ 1237 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | 1238 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); 1239 lapic_ipi_wait(100); 1240 1241 /* Explicitly deassert the INIT IPI. */ 1242 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | 1243 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 1244 apic_id); 1245 1246 DELAY(10000); /* wait ~10mS */ 1247 1248 /* 1249 * next we do a STARTUP IPI: the previous INIT IPI might still be 1250 * latched, (P5 bug) this 1st STARTUP would then terminate 1251 * immediately, and the previously started INIT IPI would continue. OR 1252 * the previous INIT IPI has already run. and this STARTUP IPI will 1253 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI 1254 * will run. 1255 */ 1256 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 1257 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 1258 vector, apic_id); 1259 if (!lapic_ipi_wait(100)) 1260 panic("Failed to deliver first STARTUP IPI to APIC %d", 1261 apic_id); 1262 DELAY(200); /* wait ~200uS */ 1263 1264 /* 1265 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF 1266 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR 1267 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is 1268 * recognized after hardware RESET or INIT IPI. 1269 */ 1270 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 1271 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 1272 vector, apic_id); 1273 if (!lapic_ipi_wait(100)) 1274 panic("Failed to deliver second STARTUP IPI to APIC %d", 1275 apic_id); 1276 1277 DELAY(200); /* wait ~200uS */ 1278 } 1279 1280 static bool 1281 ipi_bitmap_set(int cpu, u_int ipi) 1282 { 1283 u_int bitmap, old, new; 1284 u_int *cpu_bitmap; 1285 1286 bitmap = 1 << ipi; 1287 cpu_bitmap = &cpuid_to_pcpu[cpu]->pc_ipi_bitmap; 1288 old = *cpu_bitmap; 1289 for (;;) { 1290 if ((old & bitmap) != 0) 1291 break; 1292 new = old | bitmap; 1293 if (atomic_fcmpset_int(cpu_bitmap, &old, new)) 1294 break; 1295 } 1296 return (old != 0); 1297 } 1298 1299 /* 1300 * Send an IPI to specified CPU handling the bitmap logic. 
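 * Bitmapped IPIs (e.g. preempt, AST and hardclock) all share the single
 * IPI_BITMAP_VECTOR; the requested IPI is recorded in the target's
 * pc_ipi_bitmap, and no vector is sent at all if the target already has
 * bitmapped IPIs pending.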
1301 */ 1302 static void 1303 ipi_send_cpu(int cpu, u_int ipi) 1304 { 1305 1306 KASSERT((u_int)cpu < MAXCPU && cpu_apic_ids[cpu] != -1, 1307 ("IPI to non-existent CPU %d", cpu)); 1308 1309 if (IPI_IS_BITMAPED(ipi)) { 1310 if (ipi_bitmap_set(cpu, ipi)) 1311 return; 1312 ipi = IPI_BITMAP_VECTOR; 1313 } 1314 lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); 1315 } 1316 1317 void 1318 ipi_bitmap_handler(struct trapframe frame) 1319 { 1320 struct trapframe *oldframe; 1321 struct thread *td; 1322 int cpu = PCPU_GET(cpuid); 1323 u_int ipi_bitmap; 1324 1325 kasan_mark(&frame, sizeof(frame), sizeof(frame), 0); 1326 1327 td = curthread; 1328 ipi_bitmap = atomic_readandclear_int(&cpuid_to_pcpu[cpu]-> 1329 pc_ipi_bitmap); 1330 1331 /* 1332 * sched_preempt() must be called to clear the pending preempt 1333 * IPI to enable delivery of further preempts. However, the 1334 * critical section will cause extra scheduler lock thrashing 1335 * when used unconditionally. Only critical_enter() if 1336 * hardclock must also run, which requires the section entry. 1337 */ 1338 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) 1339 critical_enter(); 1340 1341 td->td_intr_nesting_level++; 1342 oldframe = td->td_intr_frame; 1343 td->td_intr_frame = &frame; 1344 #if defined(STACK) || defined(DDB) 1345 if (ipi_bitmap & (1 << IPI_TRACE)) 1346 stack_capture_intr(); 1347 #endif 1348 if (ipi_bitmap & (1 << IPI_PREEMPT)) { 1349 #ifdef COUNT_IPIS 1350 (*ipi_preempt_counts[cpu])++; 1351 #endif 1352 sched_preempt(td); 1353 } 1354 if (ipi_bitmap & (1 << IPI_AST)) { 1355 #ifdef COUNT_IPIS 1356 (*ipi_ast_counts[cpu])++; 1357 #endif 1358 /* Nothing to do for AST */ 1359 } 1360 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { 1361 #ifdef COUNT_IPIS 1362 (*ipi_hardclock_counts[cpu])++; 1363 #endif 1364 hardclockintr(); 1365 } 1366 td->td_intr_frame = oldframe; 1367 td->td_intr_nesting_level--; 1368 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) 1369 critical_exit(); 1370 } 1371 1372 /* 1373 * send an IPI to a set of cpus. 1374 */ 1375 void 1376 ipi_selected(cpuset_t cpus, u_int ipi) 1377 { 1378 int cpu; 1379 1380 /* 1381 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1382 * of help in order to understand what is the source. 1383 * Set the mask of receiving CPUs for this purpose. 1384 */ 1385 if (ipi == IPI_STOP_HARD) 1386 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus); 1387 1388 CPU_FOREACH_ISSET(cpu, &cpus) { 1389 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1390 ipi_send_cpu(cpu, ipi); 1391 } 1392 } 1393 1394 /* 1395 * send an IPI to a specific CPU. 1396 */ 1397 void 1398 ipi_cpu(int cpu, u_int ipi) 1399 { 1400 1401 /* 1402 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1403 * of help in order to understand what is the source. 1404 * Set the mask of receiving CPUs for this purpose. 1405 */ 1406 if (ipi == IPI_STOP_HARD) 1407 CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending); 1408 1409 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1410 ipi_send_cpu(cpu, ipi); 1411 } 1412 1413 /* 1414 * send an IPI to all CPUs EXCEPT myself 1415 */ 1416 void 1417 ipi_all_but_self(u_int ipi) 1418 { 1419 cpuset_t other_cpus; 1420 int cpu, c; 1421 1422 /* 1423 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1424 * of help in order to understand what is the source. 1425 * Set the mask of receiving CPUs for this purpose. 
1426 */ 1427 if (ipi == IPI_STOP_HARD) { 1428 other_cpus = all_cpus; 1429 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 1430 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus); 1431 } 1432 1433 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); 1434 if (IPI_IS_BITMAPED(ipi)) { 1435 cpu = PCPU_GET(cpuid); 1436 CPU_FOREACH(c) { 1437 if (c != cpu) 1438 ipi_bitmap_set(c, ipi); 1439 } 1440 ipi = IPI_BITMAP_VECTOR; 1441 } 1442 lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); 1443 } 1444 1445 void 1446 ipi_self_from_nmi(u_int vector) 1447 { 1448 1449 lapic_ipi_vectored(vector, APIC_IPI_DEST_SELF); 1450 1451 /* Wait for IPI to finish. */ 1452 if (!lapic_ipi_wait(50000)) { 1453 if (KERNEL_PANICKED()) 1454 return; 1455 else 1456 panic("APIC: IPI is stuck"); 1457 } 1458 } 1459 1460 int 1461 ipi_nmi_handler(void) 1462 { 1463 u_int cpuid; 1464 1465 /* 1466 * As long as there is not a simple way to know about a NMI's 1467 * source, if the bitmask for the current CPU is present in 1468 * the global pending bitword an IPI_STOP_HARD has been issued 1469 * and should be handled. 1470 */ 1471 cpuid = PCPU_GET(cpuid); 1472 if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending)) 1473 return (1); 1474 1475 CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending); 1476 cpustop_handler(); 1477 return (0); 1478 } 1479 1480 int nmi_kdb_lock; 1481 1482 void 1483 nmi_call_kdb_smp(u_int type, struct trapframe *frame) 1484 { 1485 int cpu; 1486 bool call_post; 1487 1488 cpu = PCPU_GET(cpuid); 1489 if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) { 1490 nmi_call_kdb(cpu, type, frame); 1491 call_post = false; 1492 } else { 1493 savectx(&stoppcbs[cpu]); 1494 CPU_SET_ATOMIC(cpu, &stopped_cpus); 1495 while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) 1496 ia32_pause(); 1497 call_post = true; 1498 } 1499 atomic_store_rel_int(&nmi_kdb_lock, 0); 1500 if (call_post) 1501 cpustop_handler_post(cpu); 1502 } 1503 1504 /* 1505 * Handle an IPI_STOP by saving our current context and spinning (or mwaiting, 1506 * if available) until we are resumed. 1507 */ 1508 void 1509 cpustop_handler(void) 1510 { 1511 struct monitorbuf *mb; 1512 u_int cpu; 1513 bool use_mwait; 1514 1515 cpu = PCPU_GET(cpuid); 1516 1517 savectx(&stoppcbs[cpu]); 1518 1519 use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 && 1520 !mwait_cpustop_broken); 1521 if (use_mwait) { 1522 mb = PCPU_PTR(monitorbuf); 1523 atomic_store_int(&mb->stop_state, 1524 MONITOR_STOPSTATE_STOPPED); 1525 } 1526 1527 /* Indicate that we are stopped */ 1528 CPU_SET_ATOMIC(cpu, &stopped_cpus); 1529 1530 /* Wait for restart */ 1531 while (!CPU_ISSET(cpu, &started_cpus)) { 1532 if (use_mwait) { 1533 cpu_monitor(mb, 0, 0); 1534 if (atomic_load_int(&mb->stop_state) == 1535 MONITOR_STOPSTATE_STOPPED) 1536 cpu_mwait(0, MWAIT_C1); 1537 continue; 1538 } 1539 1540 ia32_pause(); 1541 1542 /* 1543 * Halt non-BSP CPUs on panic -- we're never going to need them 1544 * again, and might as well save power / release resources 1545 * (e.g., overprovisioned VM infrastructure). 1546 */ 1547 while (__predict_false(!IS_BSP() && KERNEL_PANICKED())) 1548 halt(); 1549 } 1550 1551 cpustop_handler_post(cpu); 1552 } 1553 1554 static void 1555 cpustop_handler_post(u_int cpu) 1556 { 1557 1558 CPU_CLR_ATOMIC(cpu, &started_cpus); 1559 CPU_CLR_ATOMIC(cpu, &stopped_cpus); 1560 1561 /* 1562 * We don't broadcast TLB invalidations to other CPUs when they are 1563 * stopped. Hence, we clear the TLB before resuming. 
1564 */ 1565 invltlb_glob(); 1566 1567 #if defined(__amd64__) && (defined(DDB) || defined(GDB)) 1568 amd64_db_resume_dbreg(); 1569 #endif 1570 1571 if (cpu == 0 && cpustop_restartfunc != NULL) { 1572 cpustop_restartfunc(); 1573 cpustop_restartfunc = NULL; 1574 } 1575 } 1576 1577 /* 1578 * Handle an IPI_SUSPEND by saving our current context and spinning until we 1579 * are resumed. 1580 */ 1581 void 1582 cpususpend_handler(void) 1583 { 1584 u_int cpu; 1585 1586 mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); 1587 1588 cpu = PCPU_GET(cpuid); 1589 if (savectx(&susppcbs[cpu]->sp_pcb)) { 1590 #ifdef __amd64__ 1591 fpususpend(susppcbs[cpu]->sp_fpususpend); 1592 #else 1593 npxsuspend(susppcbs[cpu]->sp_fpususpend); 1594 #endif 1595 /* 1596 * suspended_cpus is cleared shortly after each AP is restarted 1597 * by a Startup IPI, so that the BSP can proceed to restarting 1598 * the next AP. 1599 * 1600 * resuming_cpus gets cleared when the AP completes 1601 * initialization after having been released by the BSP. 1602 * resuming_cpus is probably not the best name for the 1603 * variable, because it is actually a set of processors that 1604 * haven't resumed yet and haven't necessarily started resuming. 1605 * 1606 * Note that suspended_cpus is meaningful only for ACPI suspend 1607 * as it's not really used for Xen suspend since the APs are 1608 * automatically restored to the running state and the correct 1609 * context. For the same reason resumectx is never called in 1610 * that case. 1611 */ 1612 CPU_SET_ATOMIC(cpu, &suspended_cpus); 1613 CPU_SET_ATOMIC(cpu, &resuming_cpus); 1614 1615 /* 1616 * Invalidate the cache after setting the global status bits. 1617 * The last AP to set its bit may end up being an Owner of the 1618 * corresponding cache line in MOESI protocol. The AP may be 1619 * stopped before the cache line is written to the main memory. 1620 */ 1621 wbinvd(); 1622 } else { 1623 #ifdef __amd64__ 1624 fpuresume(susppcbs[cpu]->sp_fpususpend); 1625 #else 1626 npxresume(susppcbs[cpu]->sp_fpususpend); 1627 #endif 1628 pmap_init_pat(); 1629 initializecpu(); 1630 PCPU_SET(switchtime, 0); 1631 PCPU_SET(switchticks, ticks); 1632 1633 /* Indicate that we have restarted and restored the context. */ 1634 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 1635 } 1636 1637 /* Wait for resume directive */ 1638 while (!CPU_ISSET(cpu, &toresume_cpus)) 1639 ia32_pause(); 1640 1641 /* Re-apply microcode updates. */ 1642 ucode_reload(); 1643 1644 #ifdef __i386__ 1645 /* Finish removing the identity mapping of low memory for this AP. */ 1646 invltlb_glob(); 1647 #endif 1648 1649 if (cpu_ops.cpu_resume) 1650 cpu_ops.cpu_resume(); 1651 #ifdef __amd64__ 1652 if (vmm_resume_p) 1653 vmm_resume_p(); 1654 #endif 1655 1656 /* Resume MCA and local APIC */ 1657 lapic_xapic_mode(); 1658 mca_resume(); 1659 lapic_setup(0); 1660 1661 /* Indicate that we are resumed */ 1662 CPU_CLR_ATOMIC(cpu, &resuming_cpus); 1663 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 1664 CPU_CLR_ATOMIC(cpu, &toresume_cpus); 1665 } 1666 1667 /* 1668 * Handle an IPI_SWI by waking delayed SWI thread. 1669 */ 1670 void 1671 ipi_swi_handler(struct trapframe frame) 1672 { 1673 1674 intr_event_handle(clk_intr_event, &frame); 1675 } 1676 1677 /* 1678 * This is called once the rest of the system is up and running and we're 1679 * ready to let the AP's out of the pen. 
1680 */ 1681 static void 1682 release_aps(void *dummy __unused) 1683 { 1684 1685 if (mp_ncpus == 1) 1686 return; 1687 atomic_store_rel_int(&aps_ready, 1); 1688 while (smp_started == 0) 1689 ia32_pause(); 1690 } 1691 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); 1692 1693 #ifdef COUNT_IPIS 1694 /* 1695 * Setup interrupt counters for IPI handlers. 1696 */ 1697 static void 1698 mp_ipi_intrcnt(void *dummy) 1699 { 1700 char buf[64]; 1701 int i; 1702 1703 CPU_FOREACH(i) { 1704 snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); 1705 intrcnt_add(buf, &ipi_invltlb_counts[i]); 1706 snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); 1707 intrcnt_add(buf, &ipi_invlrng_counts[i]); 1708 snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); 1709 intrcnt_add(buf, &ipi_invlpg_counts[i]); 1710 snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); 1711 intrcnt_add(buf, &ipi_invlcache_counts[i]); 1712 snprintf(buf, sizeof(buf), "cpu%d:preempt", i); 1713 intrcnt_add(buf, &ipi_preempt_counts[i]); 1714 snprintf(buf, sizeof(buf), "cpu%d:ast", i); 1715 intrcnt_add(buf, &ipi_ast_counts[i]); 1716 snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); 1717 intrcnt_add(buf, &ipi_rendezvous_counts[i]); 1718 snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); 1719 intrcnt_add(buf, &ipi_hardclock_counts[i]); 1720 } 1721 } 1722 SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); 1723 #endif 1724