1 /*- 2 * Copyright (c) 1996, by Steve Passe 3 * Copyright (c) 2003, by Peter Wemm 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. The name of the developer may NOT be used to endorse or promote products 12 * derived from this software without specific prior written permission. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include "opt_acpi.h" 31 #ifdef __i386__ 32 #include "opt_apic.h" 33 #endif 34 #include "opt_cpu.h" 35 #include "opt_ddb.h" 36 #include "opt_gdb.h" 37 #include "opt_kstack_pages.h" 38 #include "opt_pmap.h" 39 #include "opt_sched.h" 40 #include "opt_smp.h" 41 #include "opt_stack.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/asan.h> 46 #include <sys/bus.h> 47 #include <sys/cons.h> /* cngetc() */ 48 #include <sys/cpuset.h> 49 #include <sys/csan.h> 50 #ifdef GPROF 51 #include <sys/gmon.h> 52 #endif 53 #include <sys/interrupt.h> 54 #include <sys/kdb.h> 55 #include <sys/kernel.h> 56 #include <sys/ktr.h> 57 #include <sys/lock.h> 58 #include <sys/malloc.h> 59 #include <sys/memrange.h> 60 #include <sys/mutex.h> 61 #include <sys/pcpu.h> 62 #include <sys/proc.h> 63 #include <sys/sched.h> 64 #include <sys/smp.h> 65 #include <sys/sysctl.h> 66 67 #include <vm/vm.h> 68 #include <vm/vm_param.h> 69 #include <vm/pmap.h> 70 #include <vm/vm_kern.h> 71 #include <vm/vm_extern.h> 72 #include <vm/vm_map.h> 73 74 #include <x86/apicreg.h> 75 #include <machine/clock.h> 76 #include <machine/cpu.h> 77 #include <machine/cputypes.h> 78 #include <x86/mca.h> 79 #include <machine/md_var.h> 80 #include <machine/pcb.h> 81 #include <machine/psl.h> 82 #include <machine/smp.h> 83 #include <machine/specialreg.h> 84 #include <machine/stack.h> 85 #include <x86/ucode.h> 86 87 #ifdef DEV_ACPI 88 #include <contrib/dev/acpica/include/acpi.h> 89 #include <dev/acpica/acpivar.h> 90 #endif 91 92 static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items"); 93 94 /* lock region used by kernel profiling */ 95 int mcount_lock; 96 97 int mp_naps; /* # of Applications processors */ 98 int boot_cpu_id = -1; /* designated BSP */ 99 100 /* AP uses this during bootstrap. Do not staticize. 
*/ 101 char *bootSTK; 102 int bootAP; 103 104 /* Free these after use */ 105 void *bootstacks[MAXCPU]; 106 void *dpcpu; 107 108 struct pcb stoppcbs[MAXCPU]; 109 struct susppcb **susppcbs; 110 111 #ifdef COUNT_IPIS 112 /* Interrupt counts. */ 113 static u_long *ipi_preempt_counts[MAXCPU]; 114 static u_long *ipi_ast_counts[MAXCPU]; 115 u_long *ipi_invltlb_counts[MAXCPU]; 116 u_long *ipi_invlrng_counts[MAXCPU]; 117 u_long *ipi_invlpg_counts[MAXCPU]; 118 u_long *ipi_invlcache_counts[MAXCPU]; 119 u_long *ipi_rendezvous_counts[MAXCPU]; 120 static u_long *ipi_hardclock_counts[MAXCPU]; 121 #endif 122 123 /* Default cpu_ops implementation. */ 124 struct cpu_ops cpu_ops; 125 126 /* 127 * Local data and functions. 128 */ 129 130 static volatile cpuset_t ipi_stop_nmi_pending; 131 132 volatile cpuset_t resuming_cpus; 133 volatile cpuset_t toresume_cpus; 134 135 /* used to hold the AP's until we are ready to release them */ 136 struct mtx ap_boot_mtx; 137 138 /* Set to 1 once we're ready to let the APs out of the pen. */ 139 volatile int aps_ready = 0; 140 141 /* 142 * Store data from cpu_add() until later in the boot when we actually setup 143 * the APs. 
 */
/* Per-APIC-ID CPU descriptors; allocated in cpu_alloc(). */
struct cpu_info *cpu_info;
/* Reverse map: local APIC ID -> logical CPU ID; allocated in cpu_alloc(). */
int *apic_cpuids;
/* Forward map: logical CPU ID -> local APIC ID. */
int cpu_apic_ids[MAXCPU];
_Static_assert(MAXCPU <= MAX_APIC_ID,
    "MAXCPU cannot be larger that MAX_APIC_ID");
_Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID,
    "xAPIC_MAX_APIC_ID cannot be larger that MAX_APIC_ID");

static void	release_aps(void *dummy);
static void	cpustop_handler_post(u_int cpu);

/* Tunable: allow use of HTT logical CPUs at all. */
static int hyperthreading_allowed = 1;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
	&hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");

/* Tunable: allow HTT logical CPUs to service interrupts. */
static int hyperthreading_intr_allowed = 0;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN,
	&hyperthreading_intr_allowed, 0,
	"Allow interrupts on HTT logical CPUs");

/* Root of the discovered hardware topology tree. */
static struct topo_node topo_root;

/* APIC ID bit shifts delimiting package / node / core ID fields. */
static int pkg_id_shift;
static int node_id_shift;
static int core_id_shift;
static int disabled_cpus;

/* Description of each discovered data-cache level. */
struct cache_info {
	int	id_shift;	/* APIC ID shift covered by this cache */
	int	present;	/* non-zero if this cache level exists */
} static caches[MAX_CACHE_LEVELS];

static bool stop_mwait = false;
SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0,
    "Use MONITOR/MWAIT when stopping CPU, if available");

/*
 * Invoke the memory-range driver's AP initialization hook, if one is
 * registered, so this AP's range attributes match the BSP's.
 */
void
mem_range_AP_init(void)
{

	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
		mem_range_softc.mr_op->initAP(&mem_range_softc);
}

/*
 * Round up to the next power of two, if necessary, and then
 * take log2.
 * Returns -1 if argument is zero.
 */
static __inline int
mask_width(u_int x)
{

	return (fls(x << (1 - powerof2(x))) - 1);
}

/*
 * Add a cache level to the cache topology description.
203 */ 204 static int 205 add_deterministic_cache(int type, int level, int share_count) 206 { 207 208 if (type == 0) 209 return (0); 210 if (type > 3) { 211 printf("unexpected cache type %d\n", type); 212 return (1); 213 } 214 if (type == 2) /* ignore instruction cache */ 215 return (1); 216 if (level == 0 || level > MAX_CACHE_LEVELS) { 217 printf("unexpected cache level %d\n", type); 218 return (1); 219 } 220 221 if (caches[level - 1].present) { 222 printf("WARNING: multiple entries for L%u data cache\n", level); 223 printf("%u => %u\n", caches[level - 1].id_shift, 224 mask_width(share_count)); 225 } 226 caches[level - 1].id_shift = mask_width(share_count); 227 caches[level - 1].present = 1; 228 229 if (caches[level - 1].id_shift > pkg_id_shift) { 230 printf("WARNING: L%u data cache covers more " 231 "APIC IDs than a package (%u > %u)\n", level, 232 caches[level - 1].id_shift, pkg_id_shift); 233 caches[level - 1].id_shift = pkg_id_shift; 234 } 235 if (caches[level - 1].id_shift < core_id_shift) { 236 printf("WARNING: L%u data cache covers fewer " 237 "APIC IDs than a core (%u < %u)\n", level, 238 caches[level - 1].id_shift, core_id_shift); 239 caches[level - 1].id_shift = core_id_shift; 240 } 241 242 return (1); 243 } 244 245 /* 246 * Determine topology of processing units and caches for AMD CPUs. 247 * See: 248 * - AMD CPUID Specification (Publication # 25481) 249 * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559) 250 * - BKDG For AMD Family 10h Processors (Publication # 31116) 251 * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301) 252 * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751) 253 * - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945) 254 */ 255 static void 256 topo_probe_amd(void) 257 { 258 u_int p[4]; 259 uint64_t v; 260 int level; 261 int nodes_per_socket; 262 int share_count; 263 int type; 264 int i; 265 266 /* No multi-core capability. 
*/ 267 if ((amd_feature2 & AMDID2_CMP) == 0) 268 return; 269 270 /* For families 10h and newer. */ 271 pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >> 272 AMDID_COREID_SIZE_SHIFT; 273 274 /* For 0Fh family. */ 275 if (pkg_id_shift == 0) 276 pkg_id_shift = 277 mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1); 278 279 /* 280 * Families prior to 16h define the following value as 281 * cores per compute unit and we don't really care about the AMD 282 * compute units at the moment. Perhaps we should treat them as 283 * cores and cores within the compute units as hardware threads, 284 * but that's up for debate. 285 * Later families define the value as threads per compute unit, 286 * so we are following AMD's nomenclature here. 287 */ 288 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 && 289 CPUID_TO_FAMILY(cpu_id) >= 0x16) { 290 cpuid_count(0x8000001e, 0, p); 291 share_count = ((p[1] >> 8) & 0xff) + 1; 292 core_id_shift = mask_width(share_count); 293 294 /* 295 * For Zen (17h), gather Nodes per Processor. Each node is a 296 * Zeppelin die; TR and EPYC CPUs will have multiple dies per 297 * package. Communication latency between dies is higher than 298 * within them. 
299 */ 300 nodes_per_socket = ((p[2] >> 8) & 0x7) + 1; 301 node_id_shift = pkg_id_shift - mask_width(nodes_per_socket); 302 } 303 304 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) { 305 for (i = 0; ; i++) { 306 cpuid_count(0x8000001d, i, p); 307 type = p[0] & 0x1f; 308 level = (p[0] >> 5) & 0x7; 309 share_count = 1 + ((p[0] >> 14) & 0xfff); 310 311 if (!add_deterministic_cache(type, level, share_count)) 312 break; 313 } 314 } else { 315 if (cpu_exthigh >= 0x80000005) { 316 cpuid_count(0x80000005, 0, p); 317 if (((p[2] >> 24) & 0xff) != 0) { 318 caches[0].id_shift = 0; 319 caches[0].present = 1; 320 } 321 } 322 if (cpu_exthigh >= 0x80000006) { 323 cpuid_count(0x80000006, 0, p); 324 if (((p[2] >> 16) & 0xffff) != 0) { 325 caches[1].id_shift = 0; 326 caches[1].present = 1; 327 } 328 if (((p[3] >> 18) & 0x3fff) != 0) { 329 nodes_per_socket = 1; 330 if ((amd_feature2 & AMDID2_NODE_ID) != 0) { 331 /* 332 * Handle multi-node processors that 333 * have multiple chips, each with its 334 * own L3 cache, on the same die. 335 */ 336 v = rdmsr(0xc001100c); 337 nodes_per_socket = 1 + ((v >> 3) & 0x7); 338 } 339 caches[2].id_shift = 340 pkg_id_shift - mask_width(nodes_per_socket); 341 caches[2].present = 1; 342 } 343 } 344 } 345 } 346 347 /* 348 * Determine topology of processing units for Intel CPUs 349 * using CPUID Leaf 1 and Leaf 4, if supported. 350 * See: 351 * - Intel 64 Architecture Processor Topology Enumeration 352 * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual, 353 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS 354 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS 355 */ 356 static void 357 topo_probe_intel_0x4(void) 358 { 359 u_int p[4]; 360 int max_cores; 361 int max_logical; 362 363 /* Both zero and one here mean one logical processor per package. */ 364 max_logical = (cpu_feature & CPUID_HTT) != 0 ? 
365 (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; 366 if (max_logical <= 1) 367 return; 368 369 if (cpu_high >= 0x4) { 370 cpuid_count(0x04, 0, p); 371 max_cores = ((p[0] >> 26) & 0x3f) + 1; 372 } else 373 max_cores = 1; 374 375 core_id_shift = mask_width(max_logical/max_cores); 376 KASSERT(core_id_shift >= 0, 377 ("intel topo: max_cores > max_logical\n")); 378 pkg_id_shift = core_id_shift + mask_width(max_cores); 379 } 380 381 /* 382 * Determine topology of processing units for Intel CPUs 383 * using CPUID Leaf 1Fh or 0Bh, if supported. 384 * See: 385 * - Intel 64 Architecture Processor Topology Enumeration 386 * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual, 387 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS 388 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS 389 */ 390 static void 391 topo_probe_intel_0xb(void) 392 { 393 u_int leaf; 394 u_int p[4] = { 0 }; 395 int bits; 396 int type; 397 int i; 398 399 /* Prefer leaf 1Fh (V2 Extended Topology Enumeration). */ 400 if (cpu_high >= 0x1f) { 401 leaf = 0x1f; 402 cpuid_count(leaf, 0, p); 403 } 404 /* Fall back to leaf 0Bh (Extended Topology Enumeration). */ 405 if (p[1] == 0) { 406 leaf = 0x0b; 407 cpuid_count(leaf, 0, p); 408 } 409 /* Fall back to leaf 04h (Deterministic Cache Parameters). */ 410 if (p[1] == 0) { 411 topo_probe_intel_0x4(); 412 return; 413 } 414 415 /* We only support three levels for now. */ 416 for (i = 0; ; i++) { 417 cpuid_count(leaf, i, p); 418 419 bits = p[0] & 0x1f; 420 type = (p[2] >> 8) & 0xff; 421 422 if (type == 0) 423 break; 424 425 if (type == CPUID_TYPE_SMT) 426 core_id_shift = bits; 427 else if (type == CPUID_TYPE_CORE) 428 pkg_id_shift = bits; 429 else if (bootverbose) 430 printf("Topology level type %d shift: %d\n", type, bits); 431 } 432 433 if (pkg_id_shift < core_id_shift) { 434 printf("WARNING: core covers more APIC IDs than a package\n"); 435 core_id_shift = pkg_id_shift; 436 } 437 } 438 439 /* 440 * Determine topology of caches for Intel CPUs. 
 * See:
 *  - Intel 64 Architecture Processor Topology Enumeration
 *  - Intel 64 and IA-32 Architectures Software Developer's Manual
 *    Volume 2A: Instruction Set Reference, A-M,
 *    CPUID instruction
 */
static void
topo_probe_intel_caches(void)
{
	u_int p[4];
	int level;
	int share_count;
	int type;
	int i;

	if (cpu_high < 0x4) {
		/*
		 * Available cache level and sizes can be determined
		 * via CPUID leaf 2, but that requires a huge table of hardcoded
		 * values, so for now just assume L1 and L2 caches potentially
		 * shared only by HTT processing units, if HTT is present.
		 */
		caches[0].id_shift = pkg_id_shift;
		caches[0].present = 1;
		caches[1].id_shift = pkg_id_shift;
		caches[1].present = 1;
		return;
	}

	/* Walk CPUID leaf 4 sub-leaves until an invalid (type 0) entry. */
	for (i = 0; ; i++) {
		cpuid_count(0x4, i, p);
		type = p[0] & 0x1f;
		level = (p[0] >> 5) & 0x7;
		share_count = 1 + ((p[0] >> 14) & 0xfff);

		if (!add_deterministic_cache(type, level, share_count))
			break;
	}
}

/*
 * Determine topology of processing units and caches for Intel CPUs.
 * See:
 *  - Intel 64 Architecture Processor Topology Enumeration
 */
static void
topo_probe_intel(void)
{

	/*
	 * Note that 0x1 <= cpu_high < 4 case should be
	 * compatible with topo_probe_intel_0x4() logic when
	 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
	 * or it should trigger the fallback otherwise.
	 */
	if (cpu_high >= 0xb)
		topo_probe_intel_0xb();
	else if (cpu_high >= 0x1)
		topo_probe_intel_0x4();

	topo_probe_intel_caches();
}

/*
 * Topology information is queried only on BSP, on which this
 * code runs and for which it can query CPUID information.
 * Then topology is extrapolated on all packages using an
 * assumption that APIC ID to hardware component ID mapping is
 * homogeneous.
 * That doesn't necessarily imply that the topology is uniform.
 */
void
topo_probe(void)
{
	static int cpu_topo_probed = 0;
	/* One entry per topology layer, outermost (package) first. */
	struct x86_topo_layer {
		int type;
		int subtype;
		int id_shift;
	} topo_layers[MAX_CACHE_LEVELS + 5];
	struct topo_node *parent;
	struct topo_node *node;
	int layer;
	int nlayers;
	int node_id;
	int i;
#if defined(DEV_ACPI) && MAXMEMDOM > 1
	int d, domain;
#endif

	/* Probe at most once. */
	if (cpu_topo_probed)
		return;

	CPU_ZERO(&logical_cpus_mask);

	if (mp_ncpus <= 1)
		; /* nothing */
	else if (cpu_vendor_id == CPU_VENDOR_AMD ||
	    cpu_vendor_id == CPU_VENDOR_HYGON)
		topo_probe_amd();
	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
		topo_probe_intel();

	KASSERT(pkg_id_shift >= core_id_shift,
	    ("bug in APIC topology discovery"));

	/*
	 * Build the table of topology layers from the discovered ID
	 * shifts: package, then optional node group, caches, core, PU.
	 */
	nlayers = 0;
	bzero(topo_layers, sizeof(topo_layers));

	topo_layers[nlayers].type = TOPO_TYPE_PKG;
	topo_layers[nlayers].id_shift = pkg_id_shift;
	if (bootverbose)
		printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
	nlayers++;

	if (pkg_id_shift > node_id_shift && node_id_shift != 0) {
		topo_layers[nlayers].type = TOPO_TYPE_GROUP;
		topo_layers[nlayers].id_shift = node_id_shift;
		if (bootverbose)
			printf("Node ID shift: %u\n",
			    topo_layers[nlayers].id_shift);
		nlayers++;
	}

	/*
	 * Consider all caches to be within a package/chip
	 * and "in front" of all sub-components like
	 * cores and hardware threads.
	 */
	for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
		if (caches[i].present) {
			if (node_id_shift != 0)
				KASSERT(caches[i].id_shift <= node_id_shift,
				    ("bug in APIC topology discovery"));
			KASSERT(caches[i].id_shift <= pkg_id_shift,
			    ("bug in APIC topology discovery"));
			KASSERT(caches[i].id_shift >= core_id_shift,
			    ("bug in APIC topology discovery"));

			topo_layers[nlayers].type = TOPO_TYPE_CACHE;
			topo_layers[nlayers].subtype = i + 1;
			topo_layers[nlayers].id_shift = caches[i].id_shift;
			if (bootverbose)
				printf("L%u cache ID shift: %u\n",
				    topo_layers[nlayers].subtype,
				    topo_layers[nlayers].id_shift);
			nlayers++;
		}
	}

	if (pkg_id_shift > core_id_shift) {
		topo_layers[nlayers].type = TOPO_TYPE_CORE;
		topo_layers[nlayers].id_shift = core_id_shift;
		if (bootverbose)
			printf("Core ID shift: %u\n",
			    topo_layers[nlayers].id_shift);
		nlayers++;
	}

	topo_layers[nlayers].type = TOPO_TYPE_PU;
	topo_layers[nlayers].id_shift = 0;
	nlayers++;

#if defined(DEV_ACPI) && MAXMEMDOM > 1
	/*
	 * Insert a NUMA node layer at the outermost layer whose APIC ID
	 * groups each map to a single ACPI locality.
	 */
	if (vm_ndomains > 1) {
		for (layer = 0; layer < nlayers; ++layer) {
			for (i = 0; i <= max_apic_id; ++i) {
				if ((i & ((1 << topo_layers[layer].id_shift) - 1)) == 0)
					domain = -1;
				if (!cpu_info[i].cpu_present)
					continue;
				d = acpi_pxm_get_cpu_locality(i);
				if (domain >= 0 && domain != d)
					break;
				domain = d;
			}
			if (i > max_apic_id)
				break;
		}
		KASSERT(layer < nlayers, ("NUMA domain smaller than PU"));
		memmove(&topo_layers[layer+1], &topo_layers[layer],
		    sizeof(*topo_layers) * (nlayers - layer));
		topo_layers[layer].type = TOPO_TYPE_NODE;
		topo_layers[layer].subtype = CG_SHARE_NONE;
		nlayers++;
	}
#endif

	/* Populate the topology tree with every present APIC ID. */
	topo_init_root(&topo_root);
	for (i = 0; i <= max_apic_id; ++i) {
		if (!cpu_info[i].cpu_present)
			continue;

		parent = &topo_root;
		for (layer = 0; layer < nlayers; ++layer) {
#if defined(DEV_ACPI) && MAXMEMDOM > 1
			if (topo_layers[layer].type == TOPO_TYPE_NODE) {
				node_id = acpi_pxm_get_cpu_locality(i);
			} else
#endif
				node_id = i >> topo_layers[layer].id_shift;
			parent = topo_add_node_by_hwid(parent, node_id,
			    topo_layers[layer].type,
			    topo_layers[layer].subtype);
		}
	}

	/* Promote the BSP's branch to the front at every level. */
	parent = &topo_root;
	for (layer = 0; layer < nlayers; ++layer) {
#if defined(DEV_ACPI) && MAXMEMDOM > 1
		if (topo_layers[layer].type == TOPO_TYPE_NODE)
			node_id = acpi_pxm_get_cpu_locality(boot_cpu_id);
		else
#endif
			node_id = boot_cpu_id >> topo_layers[layer].id_shift;
		node = topo_find_node_by_hwid(parent, node_id,
		    topo_layers[layer].type,
		    topo_layers[layer].subtype);
		topo_promote_child(node);
		parent = node;
	}

	cpu_topo_probed = 1;
}

/*
 * Assign logical CPU IDs to local APICs.
 */
void
assign_cpu_ids(void)
{
	struct topo_node *node;
	u_int smt_mask;
	int nhyper;

	smt_mask = (1u << core_id_shift) - 1;

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
	 */
	mp_ncpus = 0;
	nhyper = 0;
	TOPO_FOREACH(node, &topo_root) {
		if (node->type != TOPO_TYPE_PU)
			continue;

		/*
		 * A PU whose SMT-ID field differs from the BSP's is a
		 * hyperthread (the BSP is assumed to be a primary thread).
		 */
		if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
			cpu_info[node->hwid].cpu_hyperthread = 1;

		if (resource_disabled("lapic", node->hwid)) {
			if (node->hwid != boot_cpu_id)
				cpu_info[node->hwid].cpu_disabled = 1;
			else
				printf("Cannot disable BSP, APIC ID = %d\n",
				    node->hwid);
		}

		if (!hyperthreading_allowed &&
		    cpu_info[node->hwid].cpu_hyperthread)
			cpu_info[node->hwid].cpu_disabled = 1;

		if (mp_ncpus >= MAXCPU)
			cpu_info[node->hwid].cpu_disabled = 1;

		if (cpu_info[node->hwid].cpu_disabled) {
			disabled_cpus++;
			continue;
		}

		if (cpu_info[node->hwid].cpu_hyperthread)
			nhyper++;

		cpu_apic_ids[mp_ncpus] = node->hwid;
		apic_cpuids[node->hwid] = mp_ncpus;
		topo_set_pu_id(node, mp_ncpus);
		mp_ncpus++;
	}

	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));

	mp_ncores = mp_ncpus - nhyper;
	smp_threads_per_core = mp_ncpus / mp_ncores;
}

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	struct topo_node *node;
	const char *hyperthread;
	struct topo_analysis topology;

	/* Summary line covering every discovered CPU, including disabled. */
	printf("FreeBSD/SMP: ");
	if (topo_analyze(&topo_root, 1, &topology)) {
		printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]);
		if (topology.entities[TOPO_LEVEL_GROUP] > 1)
			printf(" x %d groups",
			    topology.entities[TOPO_LEVEL_GROUP]);
		if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
			printf(" x %d cache groups",
			    topology.entities[TOPO_LEVEL_CACHEGROUP]);
		if (topology.entities[TOPO_LEVEL_CORE] > 0)
			printf(" x %d core(s)",
			    topology.entities[TOPO_LEVEL_CORE]);
		if (topology.entities[TOPO_LEVEL_THREAD] > 1)
			printf(" x %d hardware threads",
			    topology.entities[TOPO_LEVEL_THREAD]);
	} else {
		printf("Non-uniform topology");
	}
	printf("\n");

	/* Second summary restricted to online CPUs, if any were disabled. */
	if (disabled_cpus) {
		printf("FreeBSD/SMP Online: ");
		if (topo_analyze(&topo_root, 0, &topology)) {
			printf("%d package(s)",
			    topology.entities[TOPO_LEVEL_PKG]);
			if (topology.entities[TOPO_LEVEL_GROUP] > 1)
				printf(" x %d groups",
				    topology.entities[TOPO_LEVEL_GROUP]);
			if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
				printf(" x %d cache groups",
				    topology.entities[TOPO_LEVEL_CACHEGROUP]);
			if (topology.entities[TOPO_LEVEL_CORE] > 0)
				printf(" x %d core(s)",
				    topology.entities[TOPO_LEVEL_CORE]);
			if (topology.entities[TOPO_LEVEL_THREAD] > 1)
				printf(" x %d hardware threads",
				    topology.entities[TOPO_LEVEL_THREAD]);
		} else {
			printf("Non-uniform topology");
		}
		printf("\n");
	}

	if (!bootverbose)
		return;

	/* Verbose: dump the per-node tree (packages, cores, PUs). */
	TOPO_FOREACH(node, &topo_root) {
		switch (node->type) {
		case TOPO_TYPE_PKG:
			printf("Package HW ID = %u\n", node->hwid);
			break;
		case TOPO_TYPE_CORE:
			printf("\tCore HW ID = %u\n", node->hwid);
			break;
		case TOPO_TYPE_PU:
			if (cpu_info[node->hwid].cpu_hyperthread)
				hyperthread = "/HT";
			else
				hyperthread = "";

			if (node->subtype == 0)
				printf("\t\tCPU (AP%s): APIC ID: %u"
				    "(disabled)\n", hyperthread, node->hwid);
			else if (node->id == 0)
				printf("\t\tCPU0 (BSP): APIC ID: %u\n",
				    node->hwid);
			else
				printf("\t\tCPU%u (AP%s): APIC ID: %u\n",
				    node->id, hyperthread, node->hwid);
			break;
		default:
			/* ignored */
			break;
		}
	}
}

/*
 * Add a scheduling group, a group of logical processors sharing
 * a particular cache (and, thus having an affinity), to the scheduling
 * topology.
 * This function recursively works on lower level caches.
 */
static void
x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
{
	struct topo_node *node;
	int nchildren;
	int ncores;
	int i;

	KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE ||
	    root->type == TOPO_TYPE_NODE || root->type == TOPO_TYPE_GROUP,
	    ("x86topo_add_sched_group: bad type: %u", root->type));
	CPU_COPY(&root->cpuset, &cg_root->cg_mask);
	cg_root->cg_count = root->cpu_count;
	if (root->type == TOPO_TYPE_CACHE)
		cg_root->cg_level = root->subtype;
	else
		cg_root->cg_level = CG_SHARE_NONE;
	if (root->type == TOPO_TYPE_NODE)
		cg_root->cg_flags = CG_FLAG_NODE;
	else
		cg_root->cg_flags = 0;

	/*
	 * Check how many core nodes we have under the given root node.
	 * If we have multiple logical processors, but not multiple
	 * cores, then those processors must be hardware threads.
	 */
	ncores = 0;
	node = root;
	while (node != NULL) {
		if (node->type != TOPO_TYPE_CORE) {
			node = topo_next_node(root, node);
			continue;
		}

		ncores++;
		node = topo_next_nonchild_node(root, node);
	}

	if (cg_root->cg_level != CG_SHARE_NONE &&
	    root->cpu_count > 1 && ncores < 2)
		cg_root->cg_flags |= CG_FLAG_SMT;

	/*
	 * Find out how many cache nodes we have under the given root node.
	 * We ignore cache nodes that cover all the same processors as the
	 * root node.  Also, we do not descend below found cache nodes.
	 * That is, we count top-level "non-redundant" caches under the root
	 * node.
	 */
	nchildren = 0;
	node = root;
	while (node != NULL) {
		if (CPU_CMP(&node->cpuset, &root->cpuset) == 0) {
			/* Redundant node: fold its attributes into the root. */
			if (node->type == TOPO_TYPE_CACHE &&
			    cg_root->cg_level < node->subtype)
				cg_root->cg_level = node->subtype;
			if (node->type == TOPO_TYPE_NODE)
				cg_root->cg_flags |= CG_FLAG_NODE;
			node = topo_next_node(root, node);
			continue;
		}
		if (node->type != TOPO_TYPE_GROUP &&
		    node->type != TOPO_TYPE_NODE &&
		    node->type != TOPO_TYPE_CACHE) {
			node = topo_next_node(root, node);
			continue;
		}
		nchildren++;
		node = topo_next_nonchild_node(root, node);
	}

	/*
	 * We are not interested in nodes including only one CPU each.
	 */
	if (nchildren == root->cpu_count)
		return;

	cg_root->cg_child = smp_topo_alloc(nchildren);
	cg_root->cg_children = nchildren;

	/*
	 * Now find again the same cache nodes as above and recursively
	 * build scheduling topologies for them.
	 */
	node = root;
	i = 0;
	while (node != NULL) {
		if ((node->type != TOPO_TYPE_GROUP &&
		    node->type != TOPO_TYPE_NODE &&
		    node->type != TOPO_TYPE_CACHE) ||
		    CPU_CMP(&node->cpuset, &root->cpuset) == 0) {
			node = topo_next_node(root, node);
			continue;
		}
		cg_root->cg_child[i].cg_parent = cg_root;
		x86topo_add_sched_group(node, &cg_root->cg_child[i]);
		i++;
		node = topo_next_nonchild_node(root, node);
	}
}

/*
 * Build the MI scheduling topology from the discovered hardware topology.
925 */ 926 struct cpu_group * 927 cpu_topo(void) 928 { 929 struct cpu_group *cg_root; 930 931 if (mp_ncpus <= 1) 932 return (smp_topo_none()); 933 934 cg_root = smp_topo_alloc(1); 935 x86topo_add_sched_group(&topo_root, cg_root); 936 return (cg_root); 937 } 938 939 static void 940 cpu_alloc(void *dummy __unused) 941 { 942 /* 943 * Dynamically allocate the arrays that depend on the 944 * maximum APIC ID. 945 */ 946 cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS, 947 M_WAITOK | M_ZERO); 948 apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS, 949 M_WAITOK | M_ZERO); 950 } 951 SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL); 952 953 /* 954 * Add a logical CPU to the topology. 955 */ 956 void 957 cpu_add(u_int apic_id, char boot_cpu) 958 { 959 960 if (apic_id > max_apic_id) { 961 panic("SMP: APIC ID %d too high", apic_id); 962 return; 963 } 964 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice", 965 apic_id)); 966 cpu_info[apic_id].cpu_present = 1; 967 if (boot_cpu) { 968 KASSERT(boot_cpu_id == -1, 969 ("CPU %u claims to be BSP, but CPU %u already is", apic_id, 970 boot_cpu_id)); 971 boot_cpu_id = apic_id; 972 cpu_info[apic_id].cpu_bsp = 1; 973 } 974 if (bootverbose) 975 printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" : 976 "AP"); 977 } 978 979 void 980 cpu_mp_setmaxid(void) 981 { 982 983 /* 984 * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). 985 * If there were no calls to cpu_add() assume this is a UP system. 986 */ 987 if (mp_ncpus == 0) 988 mp_ncpus = 1; 989 } 990 991 int 992 cpu_mp_probe(void) 993 { 994 995 /* 996 * Always record BSP in CPU map so that the mbuf init code works 997 * correctly. 998 */ 999 CPU_SETOF(0, &all_cpus); 1000 return (mp_ncpus > 1); 1001 } 1002 1003 /* 1004 * AP CPU's call this to initialize themselves. 
 */
void
init_secondary_tail(void)
{
	u_int cpuid;

	pmap_activate_boot(vmspace_pmap(proc0.p_vmspace));

	/*
	 * On real hardware, switch to x2apic mode if possible.  Do it
	 * after aps_ready was signalled, to avoid manipulating the
	 * mode while BSP might still want to send some IPI to us
	 * (second startup IPI is ignored on modern hardware etc).
	 */
	lapic_xapic_mode();

	/* Initialize the PAT MSR. */
	pmap_init_pat();

	/* set up CPU registers and state */
	cpu_setregs();

	/* set up SSE/NX */
	initializecpu();

	/* set up FPU state on the AP */
#ifdef __amd64__
	fpuinit();
#else
	npxinit(false);
#endif

	/* Platform-specific per-CPU init hook, if one was registered. */
	if (cpu_ops.cpu_init)
		cpu_ops.cpu_init();

	/* A quick check from sanity claus */
	cpuid = PCPU_GET(cpuid);
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", cpuid);
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));
	schedinit_ap();

	/* The rest of AP startup is serialized across APs. */
	mtx_lock_spin(&ap_boot_mtx);

	mca_init();

	/* Init local apic for irq's */
	lapic_setup(1);

	/* Set memory range attributes for this CPU to match the BSP */
	mem_range_AP_init();

	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
	if (bootverbose)
		printf("SMP: AP CPU #%d Launched!\n", cpuid);
	else
		printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "",
		    cpuid, smp_cpus == mp_ncpus ? "\n" : " ");

	/* Determine if we are a logical CPU. */
	if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
		CPU_SET(cpuid, &logical_cpus_mask);

	if (bootverbose)
		lapic_dump("AP");

	/* The last AP to arrive releases everyone. */
	if (smp_cpus == mp_ncpus) {
		/* enable IPI's, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
	}

#ifdef __amd64__
	if (pmap_pcid_enabled)
		load_cr4(rcr4() | CR4_PCIDE);
	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);
#endif

	mtx_unlock_spin(&ap_boot_mtx);

	/* Wait until all the AP's are up. */
	while (atomic_load_acq_int(&smp_started) == 0)
		ia32_pause();

#ifndef EARLY_AP_STARTUP
	/* Start per-CPU event timers. */
	cpu_initclocks_ap();
#endif

	kcsan_cpu_init(cpuid);

	/*
	 * Assert that smp_after_idle_runnable condition is reasonable.
	 */
	MPASS(PCPU_GET(curpcb) == NULL);

	sched_ap_entry();

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}

/*
 * SYSINIT hook: once each AP has installed a pcb (pc_curpcb becomes
 * non-NULL) it no longer runs on its bootstrap stack, so that stack
 * can be freed.
 */
static void
smp_after_idle_runnable(void *arg __unused)
{
	struct pcpu *pc;
	int cpu;

	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		pc = pcpu_find(cpu);
		while (atomic_load_ptr(&pc->pc_curpcb) == NULL)
			cpu_spinwait();
		kmem_free((vm_offset_t)bootstacks[cpu], kstack_pages *
		    PAGE_SIZE);
	}
}
SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY,
    smp_after_idle_runnable, NULL);

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		/* Skip CPU slots never assigned an APIC ID. */
		if (apic_id == -1)
			continue;
		/* The BSP registers itself; see the comment above. */
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (cpu_info[apic_id].cpu_hyperthread &&
		    !hyperthreading_intr_allowed)
			continue;

		intr_add_cpu(i);
	}
}

#ifdef COUNT_XINVLTLB_HITS
/*
 * Per-CPU counters for TLB-shootdown IPI hits, exported read-write
 * under the debug.xhits sysctl node.
 */
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
    sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
    sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
    sizeof(xhits_rng), "IU", "");

u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
    0, "");
#endif /* COUNT_XINVLTLB_HITS */

/*
 * Init and startup IPI.
 *
 * Sends the INIT / STARTUP / STARTUP sequence that brings an AP out of
 * reset and starts it executing at the real-mode trampoline selected by
 * `vector'.  Panics if a STARTUP IPI cannot be delivered.
 */
void
ipi_startup(int apic_id, int vector)
{

	/*
	 * This attempts to follow the algorithm described in the
	 * Intel Multiprocessor Specification v1.4 in section B.4.
	 * For each IPI, we allow the local APIC ~20us to deliver the
	 * IPI.  If that times out, we panic.
	 */

	/*
	 * first we do an INIT IPI: this INIT IPI might be run, resetting
	 * and running the target CPU.  OR this INIT IPI might be latched (P5
	 * bug), CPU waiting for STARTUP IPI.  OR this INIT IPI might be
	 * ignored.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
	lapic_ipi_wait(100);

	/* Explicitly deassert the INIT IPI. */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
	    apic_id);

	DELAY(10000);		/* wait ~10mS */

	/*
	 * next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched, (P5 bug) this 1st STARTUP would then terminate
	 * immediately, and the previously started INIT IPI would continue.  OR
	 * the previous INIT IPI has already run.  and this STARTUP IPI will
	 * run.  OR the previous INIT IPI was ignored.  and this STARTUP IPI
	 * will run.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver first STARTUP IPI to APIC %d",
		    apic_id);
	DELAY(200);		/* wait ~200uS */

	/*
	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
	 * the previous STARTUP IPI was cancelled by a latched INIT IPI.  OR
	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
	 * recognized after hardware RESET or INIT IPI.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver second STARTUP IPI to APIC %d",
		    apic_id);

	DELAY(200);		/* wait ~200uS */
}

/*
 * Atomically set bit `ipi' in the target CPU's pending-IPI bitmap.
 * Returns true if some IPI bit was already set beforehand, meaning the
 * IPI_BITMAP_VECTOR interrupt is already in flight for that CPU and the
 * caller need not send another one (see ipi_send_cpu()).
 */
static bool
ipi_bitmap_set(int cpu, u_int ipi)
{
	u_int bitmap, old, new;
	u_int *cpu_bitmap;

	bitmap = 1 << ipi;
	cpu_bitmap = &cpuid_to_pcpu[cpu]->pc_ipi_bitmap;
	old = *cpu_bitmap;
	/* Lock-free set; fcmpset reloads `old' on failure. */
	for (;;) {
		if ((old & bitmap) != 0)
			break;
		new = old | bitmap;
		if (atomic_fcmpset_int(cpu_bitmap, &old, new))
			break;
	}
	return (old != 0);
}

/*
 * Send an IPI to specified CPU handling the bitmap logic.
 *
 * Bitmapped IPIs share a single vector (IPI_BITMAP_VECTOR); the actual
 * request is recorded in the target's pc_ipi_bitmap and the vector is
 * only raised when no other bitmapped IPI is already pending there.
 */
static void
ipi_send_cpu(int cpu, u_int ipi)
{

	KASSERT((u_int)cpu < MAXCPU && cpu_apic_ids[cpu] != -1,
	    ("IPI to non-existent CPU %d", cpu));

	if (IPI_IS_BITMAPED(ipi)) {
		if (ipi_bitmap_set(cpu, ipi))
			return;
		ipi = IPI_BITMAP_VECTOR;
	}
	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
}

/*
 * Interrupt handler for IPI_BITMAP_VECTOR: drain this CPU's pending
 * IPI bitmap and dispatch each request recorded in it (preempt, AST,
 * hardclock, trace).
 */
void
ipi_bitmap_handler(struct trapframe frame)
{
	struct trapframe *oldframe;
	struct thread *td;
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	kasan_mark(&frame, sizeof(frame), sizeof(frame), 0);

	td = curthread;
	/* Claim all pending requests in one shot. */
	ipi_bitmap = atomic_readandclear_int(&cpuid_to_pcpu[cpu]->
	    pc_ipi_bitmap);

	/*
	 * sched_preempt() must be called to clear the pending preempt
	 * IPI to enable delivery of further preempts.  However, the
	 * critical section will cause extra scheduler lock thrashing
	 * when used unconditionally.  Only critical_enter() if
	 * hardclock must also run, which requires the section entry.
	 */
	if (ipi_bitmap & (1 << IPI_HARDCLOCK))
		critical_enter();

	td->td_intr_nesting_level++;
	oldframe = td->td_intr_frame;
	td->td_intr_frame = &frame;
#if defined(STACK) || defined(DDB)
	if (ipi_bitmap & (1 << IPI_TRACE))
		stack_capture_intr();
#endif
	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
#endif
		sched_preempt(td);
	}
	if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
#endif
		/* Nothing to do for AST */
	}
	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
#ifdef COUNT_IPIS
		(*ipi_hardclock_counts[cpu])++;
#endif
		hardclockintr();
	}
	td->td_intr_frame = oldframe;
	td->td_intr_nesting_level--;
	if (ipi_bitmap & (1 << IPI_HARDCLOCK))
		critical_exit();
}

/*
 * send an IPI to a set of cpus.
 */
void
ipi_selected(cpuset_t cpus, u_int ipi)
{
	int cpu;

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);

	CPU_FOREACH_ISSET(cpu, &cpus) {
		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
		ipi_send_cpu(cpu, ipi);
	}
}

/*
 * send an IPI to a specific CPU.
 */
void
ipi_cpu(int cpu, u_int ipi)
{

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);

	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
	ipi_send_cpu(cpu, ipi);
}

/*
 * send an IPI to all CPUs EXCEPT myself
 */
void
ipi_all_but_self(u_int ipi)
{
	cpuset_t other_cpus;
	int cpu, c;

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD) {
		other_cpus = all_cpus;
		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);
	}

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	if (IPI_IS_BITMAPED(ipi)) {
		/*
		 * Record the request in every other CPU's bitmap, then
		 * broadcast the shared bitmap vector once.
		 */
		cpu = PCPU_GET(cpuid);
		CPU_FOREACH(c) {
			if (c != cpu)
				ipi_bitmap_set(c, ipi);
		}
		ipi = IPI_BITMAP_VECTOR;
	}
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}

/*
 * Send a self-directed IPI.  Safe to call from NMI context: it does not
 * take locks, and it tolerates a stuck APIC if the kernel has already
 * panicked.
 */
void
ipi_self_from_nmi(u_int vector)
{

	lapic_ipi_vectored(vector, APIC_IPI_DEST_SELF);

	/* Wait for IPI to finish. */
	if (!lapic_ipi_wait(50000)) {
		if (KERNEL_PANICKED())
			return;
		else
			panic("APIC: IPI is stuck");
	}
}

/*
 * NMI entry hook: decide whether this NMI was an IPI_STOP_HARD aimed at
 * us.  Returns 0 if handled here, 1 if the NMI came from somewhere else
 * and the caller should continue normal NMI processing.
 */
int
ipi_nmi_handler(void)
{
	u_int cpuid;

	/*
	 * As long as there is not a simple way to know about a NMI's
	 * source, if the bitmask for the current CPU is present in
	 * the global pending bitword an IPI_STOP_HARD has been issued
	 * and should be handled.
	 */
	cpuid = PCPU_GET(cpuid);
	if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
		return (1);

	CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
	cpustop_handler();
	return (0);
}

/* Serializes entry of CPUs into the debugger on an NMI; see below. */
int nmi_kdb_lock;

/*
 * Funnel all CPUs that took a debugger NMI through nmi_call_kdb() one
 * at a time: the first CPU to grab nmi_kdb_lock enters the debugger
 * directly, the rest save their context, mark themselves stopped, and
 * spin until they can take the lock in turn.
 */
void
nmi_call_kdb_smp(u_int type, struct trapframe *frame)
{
	int cpu;
	bool call_post;

	cpu = PCPU_GET(cpuid);
	if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
		nmi_call_kdb(cpu, type, frame);
		call_post = false;
	} else {
		savectx(&stoppcbs[cpu]);
		CPU_SET_ATOMIC(cpu, &stopped_cpus);
		while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
			ia32_pause();
		call_post = true;
	}
	atomic_store_rel_int(&nmi_kdb_lock, 0);
	if (call_post)
		cpustop_handler_post(cpu);
}

/*
 * Handle an IPI_STOP by saving our current context and spinning (or mwaiting,
 * if available) until we are resumed.
 */
void
cpustop_handler(void)
{
	struct monitorbuf *mb;
	u_int cpu;
	bool use_mwait;

	cpu = PCPU_GET(cpuid);

	savectx(&stoppcbs[cpu]);

	/* Prefer MONITOR/MWAIT over a spin loop when usable. */
	use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 &&
	    !mwait_cpustop_broken);
	if (use_mwait) {
		mb = PCPU_PTR(monitorbuf);
		atomic_store_int(&mb->stop_state,
		    MONITOR_STOPSTATE_STOPPED);
	}

	/* Indicate that we are stopped */
	CPU_SET_ATOMIC(cpu, &stopped_cpus);

	/* Wait for restart */
	while (!CPU_ISSET(cpu, &started_cpus)) {
		if (use_mwait) {
			/* Re-arm the monitor; recheck state before mwait. */
			cpu_monitor(mb, 0, 0);
			if (atomic_load_int(&mb->stop_state) ==
			    MONITOR_STOPSTATE_STOPPED)
				cpu_mwait(0, MWAIT_C1);
			continue;
		}

		ia32_pause();

		/*
		 * Halt non-BSP CPUs on panic -- we're never going to need them
		 * again, and might as well save power / release resources
		 * (e.g., overprovisioned VM infrastructure).
		 */
		while (__predict_false(!IS_BSP() && KERNEL_PANICKED()))
			halt();
	}

	cpustop_handler_post(cpu);
}

/*
 * Common tail of restart after a stop: clear this CPU's stop/start
 * bits, flush stale TLB entries, and run any pending restart hook on
 * the BSP.
 */
static void
cpustop_handler_post(u_int cpu)
{

	CPU_CLR_ATOMIC(cpu, &started_cpus);
	CPU_CLR_ATOMIC(cpu, &stopped_cpus);

	/*
	 * We don't broadcast TLB invalidations to other CPUs when they are
	 * stopped.  Hence, we clear the TLB before resuming.
	 */
	invltlb_glob();

#if defined(__amd64__) && (defined(DDB) || defined(GDB))
	amd64_db_resume_dbreg();
#endif

	/* Only the BSP (cpu 0) runs the one-shot restart callback. */
	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * Handle an IPI_SUSPEND by saving our current context and spinning until we
 * are resumed.
 */
void
cpususpend_handler(void)
{
	u_int cpu;

	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);

	cpu = PCPU_GET(cpuid);
	/*
	 * The non-zero return is the direct call (suspend path); when the
	 * saved context is later resumed we come back through savectx()
	 * with zero and take the else branch below (resume path).
	 */
	if (savectx(&susppcbs[cpu]->sp_pcb)) {
#ifdef __amd64__
		fpususpend(susppcbs[cpu]->sp_fpususpend);
#else
		npxsuspend(susppcbs[cpu]->sp_fpususpend);
#endif
		/*
		 * suspended_cpus is cleared shortly after each AP is restarted
		 * by a Startup IPI, so that the BSP can proceed to restarting
		 * the next AP.
		 *
		 * resuming_cpus gets cleared when the AP completes
		 * initialization after having been released by the BSP.
		 * resuming_cpus is probably not the best name for the
		 * variable, because it is actually a set of processors that
		 * haven't resumed yet and haven't necessarily started resuming.
		 *
		 * Note that suspended_cpus is meaningful only for ACPI suspend
		 * as it's not really used for Xen suspend since the APs are
		 * automatically restored to the running state and the correct
		 * context.  For the same reason resumectx is never called in
		 * that case.
		 */
		CPU_SET_ATOMIC(cpu, &suspended_cpus);
		CPU_SET_ATOMIC(cpu, &resuming_cpus);

		/*
		 * Invalidate the cache after setting the global status bits.
		 * The last AP to set its bit may end up being an Owner of the
		 * corresponding cache line in MOESI protocol.  The AP may be
		 * stopped before the cache line is written to the main memory.
		 */
		wbinvd();
	} else {
#ifdef __amd64__
		fpuresume(susppcbs[cpu]->sp_fpususpend);
#else
		npxresume(susppcbs[cpu]->sp_fpususpend);
#endif
		/* Redo CPU setup that does not survive suspend. */
		pmap_init_pat();
		initializecpu();
		PCPU_SET(switchtime, 0);
		PCPU_SET(switchticks, ticks);

		/* Indicate that we have restarted and restored the context. */
		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	}

	/* Wait for resume directive */
	while (!CPU_ISSET(cpu, &toresume_cpus))
		ia32_pause();

	/* Re-apply microcode updates. */
	ucode_reload();

#ifdef __i386__
	/* Finish removing the identity mapping of low memory for this AP. */
	invltlb_glob();
#endif

	if (cpu_ops.cpu_resume)
		cpu_ops.cpu_resume();
#ifdef __amd64__
	if (vmm_resume_p)
		vmm_resume_p();
#endif

	/* Resume MCA and local APIC */
	lapic_xapic_mode();
	mca_resume();
	lapic_setup(0);

	/* Indicate that we are resumed */
	CPU_CLR_ATOMIC(cpu, &resuming_cpus);
	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	CPU_CLR_ATOMIC(cpu, &toresume_cpus);
}

/*
 * Handle an IPI_SWI by waking delayed SWI thread.
 */
void
ipi_swi_handler(struct trapframe frame)
{

	intr_event_handle(clk_intr_event, &frame);
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the AP's out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	/* Nothing to release on a uniprocessor configuration. */
	if (mp_ncpus == 1)
		return;
	/*
	 * Release store pairs with the APs' acquire loads of aps_ready;
	 * then wait for the last AP to flip smp_started (set in
	 * init_secondary_tail()).
	 */
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

#ifdef COUNT_IPIS
/*
 * Setup interrupt counters for IPI handlers.
 *
 * Registers one named intrcnt slot per CPU per IPI type so the counts
 * show up in the system interrupt statistics.
 */
static void
mp_ipi_intrcnt(void *dummy)
{
	char buf[64];
	int i;

	CPU_FOREACH(i) {
		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
		intrcnt_add(buf, &ipi_invltlb_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
		intrcnt_add(buf, &ipi_invlrng_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
		intrcnt_add(buf, &ipi_invlpg_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
		intrcnt_add(buf, &ipi_invlcache_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
		intrcnt_add(buf, &ipi_preempt_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
		intrcnt_add(buf, &ipi_ast_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
		intrcnt_add(buf, &ipi_hardclock_counts[i]);
	}
}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
#endif