/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2003, by Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef __i386__
#include "opt_apic.h"
#endif
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_kstack_pages.h"
#include "opt_pmap.h"
#include "opt_sched.h"
#include "opt_smp.h"
#include "opt_stack.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cons.h>	/* cngetc() */
#include <sys/cpuset.h>
#include <sys/csan.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/interrupt.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>

#include <x86/apicreg.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/stack.h>
#include <x86/ucode.h>

static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items");

/* lock region used by kernel profiling */
int	mcount_lock;

int	mp_naps;		/* # of Applications processors */
int	boot_cpu_id = -1;	/* designated BSP */

/* AP uses this during bootstrap.  Do not staticize.  */
char *bootSTK;
int bootAP;

/* Free these after use */
void *bootstacks[MAXCPU];
void *dpcpu;

struct pcb stoppcbs[MAXCPU];
struct susppcb **susppcbs;

#ifdef COUNT_IPIS
/* Interrupt counts. */
static u_long *ipi_preempt_counts[MAXCPU];
static u_long *ipi_ast_counts[MAXCPU];
u_long *ipi_invltlb_counts[MAXCPU];
u_long *ipi_invlrng_counts[MAXCPU];
u_long *ipi_invlpg_counts[MAXCPU];
u_long *ipi_invlcache_counts[MAXCPU];
u_long *ipi_rendezvous_counts[MAXCPU];
static u_long *ipi_hardclock_counts[MAXCPU];
#endif

/* Default cpu_ops implementation. */
struct cpu_ops cpu_ops;

/*
 * Local data and functions.
 */

static volatile cpuset_t ipi_stop_nmi_pending;

volatile cpuset_t resuming_cpus;
volatile cpuset_t toresume_cpus;

/* used to hold the AP's until we are ready to release them */
struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info *cpu_info;
int *apic_cpuids;
int cpu_apic_ids[MAXCPU];
_Static_assert(MAXCPU <= MAX_APIC_ID,
    "MAXCPU cannot be larger than MAX_APIC_ID");
_Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID,
    "xAPIC_MAX_APIC_ID cannot be larger than MAX_APIC_ID");

static void	release_aps(void *dummy);
static void	cpustop_handler_post(u_int cpu);

static int	hyperthreading_allowed = 1;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
    &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");

static int	hyperthreading_intr_allowed = 0;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN,
    &hyperthreading_intr_allowed, 0,
    "Allow interrupts on HTT logical CPUs");

static struct topo_node topo_root;

static int pkg_id_shift;
static int node_id_shift;
static int core_id_shift;
static int disabled_cpus;

struct cache_info {
	int	id_shift;
	int	present;
} static caches[MAX_CACHE_LEVELS];

unsigned int boot_address;

static bool stop_mwait = false;
SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0,
    "Use MONITOR/MWAIT when stopping CPU, if available");

#define MiB(v)	(v ## ULL << 20)

void
mem_range_AP_init(void)
{

	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
		mem_range_softc.mr_op->initAP(&mem_range_softc);
}

/*
 * Round up to the next power of two, if necessary, and then
 * take log2.
 * Returns -1 if argument is zero.
 */
static __inline int
mask_width(u_int x)
{

	return (fls(x << (1 - powerof2(x))) - 1);
}

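/*
 * Worked examples: mask_width(4) == 2 since 4 is already a power of two;
 * mask_width(6) rounds up to 8 and returns 3; mask_width(1) == 0 and
 * mask_width(0) == -1.  The result is the number of APIC ID bits needed
 * to enumerate that many entities.
 */
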
/*
 * Add a cache level to the cache topology description.
 */
static int
add_deterministic_cache(int type, int level, int share_count)
{

	if (type == 0)
		return (0);
	if (type > 3) {
		printf("unexpected cache type %d\n", type);
		return (1);
	}
	if (type == 2) /* ignore instruction cache */
		return (1);
	if (level == 0 || level > MAX_CACHE_LEVELS) {
		printf("unexpected cache level %d\n", level);
		return (1);
	}

	if (caches[level - 1].present) {
		printf("WARNING: multiple entries for L%u data cache\n", level);
		printf("%u => %u\n", caches[level - 1].id_shift,
		    mask_width(share_count));
	}
	caches[level - 1].id_shift = mask_width(share_count);
	caches[level - 1].present = 1;

	if (caches[level - 1].id_shift > pkg_id_shift) {
		printf("WARNING: L%u data cache covers more "
		    "APIC IDs than a package (%u > %u)\n", level,
		    caches[level - 1].id_shift, pkg_id_shift);
		caches[level - 1].id_shift = pkg_id_shift;
	}
	if (caches[level - 1].id_shift < core_id_shift) {
		printf("WARNING: L%u data cache covers fewer "
		    "APIC IDs than a core (%u < %u)\n", level,
		    caches[level - 1].id_shift, core_id_shift);
		caches[level - 1].id_shift = core_id_shift;
	}

	return (1);
}

/*
 * Determine topology of processing units and caches for AMD CPUs.
 * See:
 * - AMD CPUID Specification (Publication # 25481)
 * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
 * - BKDG For AMD Family 10h Processors (Publication # 31116)
 * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
 * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
 * - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945)
 */
static void
topo_probe_amd(void)
{
	u_int p[4];
	uint64_t v;
	int level;
	int nodes_per_socket;
	int share_count;
	int type;
	int i;

	/* No multi-core capability. */
	if ((amd_feature2 & AMDID2_CMP) == 0)
		return;

	/* For families 10h and newer. */
	pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
	    AMDID_COREID_SIZE_SHIFT;

	/* For 0Fh family. */
	if (pkg_id_shift == 0)
		pkg_id_shift =
		    mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);

	/*
	 * Families prior to 16h define the following value as
	 * cores per compute unit and we don't really care about the AMD
	 * compute units at the moment.  Perhaps we should treat them as
	 * cores and cores within the compute units as hardware threads,
	 * but that's up for debate.
	 * Later families define the value as threads per compute unit,
	 * so we are following AMD's nomenclature here.
	 */
	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
	    CPUID_TO_FAMILY(cpu_id) >= 0x16) {
		cpuid_count(0x8000001e, 0, p);
		share_count = ((p[1] >> 8) & 0xff) + 1;
		core_id_shift = mask_width(share_count);

		/*
		 * For Zen (17h), gather Nodes per Processor.  Each node is a
		 * Zeppelin die; TR and EPYC CPUs will have multiple dies per
		 * package.  Communication latency between dies is higher than
		 * within them.
		 */
		nodes_per_socket = ((p[2] >> 8) & 0x7) + 1;
		node_id_shift = pkg_id_shift - mask_width(nodes_per_socket);
	}

	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
		for (i = 0; ; i++) {
			cpuid_count(0x8000001d, i, p);
			type = p[0] & 0x1f;
			level = (p[0] >> 5) & 0x7;
			share_count = 1 + ((p[0] >> 14) & 0xfff);

			if (!add_deterministic_cache(type, level, share_count))
				break;
		}
	} else {
		if (cpu_exthigh >= 0x80000005) {
			cpuid_count(0x80000005, 0, p);
			if (((p[2] >> 24) & 0xff) != 0) {
				caches[0].id_shift = 0;
				caches[0].present = 1;
			}
		}
		if (cpu_exthigh >= 0x80000006) {
			cpuid_count(0x80000006, 0, p);
			if (((p[2] >> 16) & 0xffff) != 0) {
				caches[1].id_shift = 0;
				caches[1].present = 1;
			}
			if (((p[3] >> 18) & 0x3fff) != 0) {
				nodes_per_socket = 1;
				if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
					/*
					 * Handle multi-node processors that
					 * have multiple chips, each with its
					 * own L3 cache, on the same die.
					 */
					v = rdmsr(0xc001100c);
					nodes_per_socket = 1 + ((v >> 3) & 0x7);
				}
				caches[2].id_shift =
				    pkg_id_shift - mask_width(nodes_per_socket);
				caches[2].present = 1;
			}
		}
	}
}

/*
 * Determine topology of processing units for Intel CPUs
 * using CPUID Leaf 1 and Leaf 4, if supported.
 * See:
 * - Intel 64 Architecture Processor Topology Enumeration
 * - Intel 64 and IA-32 Architectures Software Developer’s Manual,
 *   Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
 *   FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
 */
static void
topo_probe_intel_0x4(void)
{
	u_int p[4];
	int max_cores;
	int max_logical;

	/* Both zero and one here mean one logical processor per package. */
	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
	if (max_logical <= 1)
		return;

	if (cpu_high >= 0x4) {
		cpuid_count(0x04, 0, p);
		max_cores = ((p[0] >> 26) & 0x3f) + 1;
	} else
		max_cores = 1;

	core_id_shift = mask_width(max_logical/max_cores);
	KASSERT(core_id_shift >= 0,
	    ("intel topo: max_cores > max_logical\n"));
	pkg_id_shift = core_id_shift + mask_width(max_cores);
}

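/*
 * Illustrative example with assumed CPUID values (not taken from any
 * particular CPU): if CPUID.1:EBX[23:16] reports 8 logical processors per
 * package and leaf 4 reports 4 cores per package, then core_id_shift =
 * mask_width(8 / 4) = 1 and pkg_id_shift = 1 + mask_width(4) = 3, i.e.
 * APIC ID bit 0 selects the hardware thread, bits 2:1 select the core and
 * bits 3 and up select the package.
 */
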
/*
 * Determine topology of processing units for Intel CPUs
 * using CPUID Leaf 11, if supported.
 * See:
 * - Intel 64 Architecture Processor Topology Enumeration
 * - Intel 64 and IA-32 Architectures Software Developer’s Manual,
 *   Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
 *   FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
 */
static void
topo_probe_intel_0xb(void)
{
	u_int p[4];
	int bits;
	int type;
	int i;

	/* Fall back if CPU leaf 11 doesn't really exist. */
	cpuid_count(0x0b, 0, p);
	if (p[1] == 0) {
		topo_probe_intel_0x4();
		return;
	}

	/* We only support three levels for now. */
	for (i = 0; ; i++) {
		cpuid_count(0x0b, i, p);

		bits = p[0] & 0x1f;
		type = (p[2] >> 8) & 0xff;

		if (type == 0)
			break;

		/* TODO: check for duplicate (re-)assignment */
		if (type == CPUID_TYPE_SMT)
			core_id_shift = bits;
		else if (type == CPUID_TYPE_CORE)
			pkg_id_shift = bits;
		else
			printf("unknown CPU level type %d\n", type);
	}

	if (pkg_id_shift < core_id_shift) {
		printf("WARNING: core covers more APIC IDs than a package\n");
		core_id_shift = pkg_id_shift;
	}
}

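/*
 * Illustrative example with assumed leaf 0xb values (not taken from any
 * particular CPU): if the SMT level reports a shift of 1 and the core level
 * reports a shift of 4, then core_id_shift = 1 and pkg_id_shift = 4, so APIC
 * ID bit 0 identifies the thread within a core, bits 3:1 identify the core
 * within a package and bits 4 and up identify the package.
 */
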
/*
 * Determine topology of caches for Intel CPUs.
 * See:
 * - Intel 64 Architecture Processor Topology Enumeration
 * - Intel 64 and IA-32 Architectures Software Developer’s Manual
 *   Volume 2A: Instruction Set Reference, A-M,
 *   CPUID instruction
 */
static void
topo_probe_intel_caches(void)
{
	u_int p[4];
	int level;
	int share_count;
	int type;
	int i;

	if (cpu_high < 0x4) {
		/*
		 * Available cache level and sizes can be determined
		 * via CPUID leaf 2, but that requires a huge table of hardcoded
		 * values, so for now just assume L1 and L2 caches potentially
		 * shared only by HTT processing units, if HTT is present.
		 */
		caches[0].id_shift = pkg_id_shift;
		caches[0].present = 1;
		caches[1].id_shift = pkg_id_shift;
		caches[1].present = 1;
		return;
	}

	for (i = 0; ; i++) {
		cpuid_count(0x4, i, p);
		type = p[0] & 0x1f;
		level = (p[0] >> 5) & 0x7;
		share_count = 1 + ((p[0] >> 14) & 0xfff);

		if (!add_deterministic_cache(type, level, share_count))
			break;
	}
}

/*
 * Determine topology of processing units and caches for Intel CPUs.
 * See:
 * - Intel 64 Architecture Processor Topology Enumeration
 */
static void
topo_probe_intel(void)
{

	/*
	 * Note that 0x1 <= cpu_high < 4 case should be
	 * compatible with topo_probe_intel_0x4() logic when
	 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
	 * or it should trigger the fallback otherwise.
	 */
	if (cpu_high >= 0xb)
		topo_probe_intel_0xb();
	else if (cpu_high >= 0x1)
		topo_probe_intel_0x4();

	topo_probe_intel_caches();
}

/*
 * Topology information is queried only on BSP, on which this
 * code runs and for which it can query CPUID information.
 * Then topology is extrapolated on all packages using an
 * assumption that APIC ID to hardware component ID mapping is
 * homogeneous.
 * That doesn't necessarily imply that the topology is uniform.
 */
void
topo_probe(void)
{
	static int cpu_topo_probed = 0;
	struct x86_topo_layer {
		int type;
		int subtype;
		int id_shift;
	} topo_layers[MAX_CACHE_LEVELS + 4];
	struct topo_node *parent;
	struct topo_node *node;
	int layer;
	int nlayers;
	int node_id;
	int i;

	if (cpu_topo_probed)
		return;

	CPU_ZERO(&logical_cpus_mask);

	if (mp_ncpus <= 1)
		; /* nothing */
	else if (cpu_vendor_id == CPU_VENDOR_AMD ||
	    cpu_vendor_id == CPU_VENDOR_HYGON)
		topo_probe_amd();
	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
		topo_probe_intel();

	KASSERT(pkg_id_shift >= core_id_shift,
	    ("bug in APIC topology discovery"));

	nlayers = 0;
	bzero(topo_layers, sizeof(topo_layers));

	topo_layers[nlayers].type = TOPO_TYPE_PKG;
	topo_layers[nlayers].id_shift = pkg_id_shift;
	if (bootverbose)
		printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
	nlayers++;

	if (pkg_id_shift > node_id_shift && node_id_shift != 0) {
		topo_layers[nlayers].type = TOPO_TYPE_GROUP;
		topo_layers[nlayers].id_shift = node_id_shift;
		if (bootverbose)
			printf("Node ID shift: %u\n",
			    topo_layers[nlayers].id_shift);
		nlayers++;
	}

	/*
	 * Consider all caches to be within a package/chip
	 * and "in front" of all sub-components like
	 * cores and hardware threads.
	 */
	for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
		if (caches[i].present) {
			if (node_id_shift != 0)
				KASSERT(caches[i].id_shift <= node_id_shift,
				    ("bug in APIC topology discovery"));
			KASSERT(caches[i].id_shift <= pkg_id_shift,
			    ("bug in APIC topology discovery"));
			KASSERT(caches[i].id_shift >= core_id_shift,
			    ("bug in APIC topology discovery"));

			topo_layers[nlayers].type = TOPO_TYPE_CACHE;
			topo_layers[nlayers].subtype = i + 1;
			topo_layers[nlayers].id_shift = caches[i].id_shift;
			if (bootverbose)
				printf("L%u cache ID shift: %u\n",
				    topo_layers[nlayers].subtype,
				    topo_layers[nlayers].id_shift);
			nlayers++;
		}
	}

	if (pkg_id_shift > core_id_shift) {
		topo_layers[nlayers].type = TOPO_TYPE_CORE;
		topo_layers[nlayers].id_shift = core_id_shift;
		if (bootverbose)
			printf("Core ID shift: %u\n",
			    topo_layers[nlayers].id_shift);
		nlayers++;
	}

	topo_layers[nlayers].type = TOPO_TYPE_PU;
	topo_layers[nlayers].id_shift = 0;
	nlayers++;

	topo_init_root(&topo_root);
	for (i = 0; i <= max_apic_id; ++i) {
		if (!cpu_info[i].cpu_present)
			continue;

		parent = &topo_root;
		for (layer = 0; layer < nlayers; ++layer) {
			node_id = i >> topo_layers[layer].id_shift;
			parent = topo_add_node_by_hwid(parent, node_id,
			    topo_layers[layer].type,
			    topo_layers[layer].subtype);
		}
	}

	parent = &topo_root;
	for (layer = 0; layer < nlayers; ++layer) {
		node_id = boot_cpu_id >> topo_layers[layer].id_shift;
		node = topo_find_node_by_hwid(parent, node_id,
		    topo_layers[layer].type,
		    topo_layers[layer].subtype);
		topo_promote_child(node);
		parent = node;
	}

	cpu_topo_probed = 1;
}

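/*
 * Illustration with hypothetical shifts (pkg_id_shift = 3, core_id_shift = 1,
 * a per-core L2 with id_shift = 1 and a per-package L3 with id_shift = 3):
 * the layers become PKG, L3, L2, CORE, PU, and APIC ID 5 is filed under
 * package 5 >> 3 = 0, L3 group 0, L2 group and core 5 >> 1 = 2, and
 * processing unit 5.
 */
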
/*
 * Assign logical CPU IDs to local APICs.
 */
void
assign_cpu_ids(void)
{
	struct topo_node *node;
	u_int smt_mask;
	int nhyper;

	smt_mask = (1u << core_id_shift) - 1;

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
	 */
	mp_ncpus = 0;
	nhyper = 0;
	TOPO_FOREACH(node, &topo_root) {
		if (node->type != TOPO_TYPE_PU)
			continue;

		if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
			cpu_info[node->hwid].cpu_hyperthread = 1;

		if (resource_disabled("lapic", node->hwid)) {
			if (node->hwid != boot_cpu_id)
				cpu_info[node->hwid].cpu_disabled = 1;
			else
				printf("Cannot disable BSP, APIC ID = %d\n",
				    node->hwid);
		}

		if (!hyperthreading_allowed &&
		    cpu_info[node->hwid].cpu_hyperthread)
			cpu_info[node->hwid].cpu_disabled = 1;

		if (mp_ncpus >= MAXCPU)
			cpu_info[node->hwid].cpu_disabled = 1;

		if (cpu_info[node->hwid].cpu_disabled) {
			disabled_cpus++;
			continue;
		}

		if (cpu_info[node->hwid].cpu_hyperthread)
			nhyper++;

		cpu_apic_ids[mp_ncpus] = node->hwid;
		apic_cpuids[node->hwid] = mp_ncpus;
		topo_set_pu_id(node, mp_ncpus);
		mp_ncpus++;
	}

	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));

	mp_ncores = mp_ncpus - nhyper;
	smp_threads_per_core = mp_ncpus / mp_ncores;
}

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	struct topo_node *node;
	const char *hyperthread;
	struct topo_analysis topology;

	printf("FreeBSD/SMP: ");
	if (topo_analyze(&topo_root, 1, &topology)) {
		printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]);
		if (topology.entities[TOPO_LEVEL_GROUP] > 1)
			printf(" x %d groups",
			    topology.entities[TOPO_LEVEL_GROUP]);
		if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
			printf(" x %d cache groups",
			    topology.entities[TOPO_LEVEL_CACHEGROUP]);
		if (topology.entities[TOPO_LEVEL_CORE] > 0)
			printf(" x %d core(s)",
			    topology.entities[TOPO_LEVEL_CORE]);
		if (topology.entities[TOPO_LEVEL_THREAD] > 1)
			printf(" x %d hardware threads",
			    topology.entities[TOPO_LEVEL_THREAD]);
	} else {
		printf("Non-uniform topology");
	}
	printf("\n");

	if (disabled_cpus) {
		printf("FreeBSD/SMP Online: ");
		if (topo_analyze(&topo_root, 0, &topology)) {
			printf("%d package(s)",
			    topology.entities[TOPO_LEVEL_PKG]);
			if (topology.entities[TOPO_LEVEL_GROUP] > 1)
				printf(" x %d groups",
				    topology.entities[TOPO_LEVEL_GROUP]);
			if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
				printf(" x %d cache groups",
				    topology.entities[TOPO_LEVEL_CACHEGROUP]);
			if (topology.entities[TOPO_LEVEL_CORE] > 0)
				printf(" x %d core(s)",
				    topology.entities[TOPO_LEVEL_CORE]);
			if (topology.entities[TOPO_LEVEL_THREAD] > 1)
				printf(" x %d hardware threads",
				    topology.entities[TOPO_LEVEL_THREAD]);
		} else {
			printf("Non-uniform topology");
		}
		printf("\n");
	}

	if (!bootverbose)
		return;

	TOPO_FOREACH(node, &topo_root) {
		switch (node->type) {
		case TOPO_TYPE_PKG:
			printf("Package HW ID = %u\n", node->hwid);
			break;
		case TOPO_TYPE_CORE:
			printf("\tCore HW ID = %u\n", node->hwid);
			break;
		case TOPO_TYPE_PU:
			if (cpu_info[node->hwid].cpu_hyperthread)
				hyperthread = "/HT";
			else
				hyperthread = "";

			if (node->subtype == 0)
				printf("\t\tCPU (AP%s): APIC ID: %u"
				    " (disabled)\n", hyperthread, node->hwid);
			else if (node->id == 0)
				printf("\t\tCPU0 (BSP): APIC ID: %u\n",
				    node->hwid);
			else
				printf("\t\tCPU%u (AP%s): APIC ID: %u\n",
				    node->id, hyperthread, node->hwid);
			break;
		default:
			/* ignored */
			break;
		}
	}
}

/*
 * Add a scheduling group, a group of logical processors sharing
 * a particular cache (and, thus having an affinity), to the scheduling
 * topology.
 * This function recursively works on lower level caches.
 */
static void
x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
{
	struct topo_node *node;
	int nchildren;
	int ncores;
	int i;

	KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE ||
	    root->type == TOPO_TYPE_GROUP,
	    ("x86topo_add_sched_group: bad type: %u", root->type));
	CPU_COPY(&root->cpuset, &cg_root->cg_mask);
	cg_root->cg_count = root->cpu_count;
	if (root->type == TOPO_TYPE_SYSTEM)
		cg_root->cg_level = CG_SHARE_NONE;
	else
		cg_root->cg_level = root->subtype;

	/*
	 * Check how many core nodes we have under the given root node.
	 * If we have multiple logical processors, but not multiple
	 * cores, then those processors must be hardware threads.
	 */
	ncores = 0;
	node = root;
	while (node != NULL) {
		if (node->type != TOPO_TYPE_CORE) {
			node = topo_next_node(root, node);
			continue;
		}

		ncores++;
		node = topo_next_nonchild_node(root, node);
	}

	if (cg_root->cg_level != CG_SHARE_NONE &&
	    root->cpu_count > 1 && ncores < 2)
		cg_root->cg_flags = CG_FLAG_SMT;

	/*
	 * Find out how many cache nodes we have under the given root node.
	 * We ignore cache nodes that cover all the same processors as the
	 * root node.  Also, we do not descend below found cache nodes.
	 * That is, we count top-level "non-redundant" caches under the root
	 * node.
	 */
	nchildren = 0;
	node = root;
	while (node != NULL) {
		if ((node->type != TOPO_TYPE_GROUP &&
		    node->type != TOPO_TYPE_CACHE) ||
		    (root->type != TOPO_TYPE_SYSTEM &&
		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
			node = topo_next_node(root, node);
			continue;
		}
		nchildren++;
		node = topo_next_nonchild_node(root, node);
	}

	cg_root->cg_child = smp_topo_alloc(nchildren);
	cg_root->cg_children = nchildren;

	/*
	 * Now find again the same cache nodes as above and recursively
	 * build scheduling topologies for them.
	 */
	node = root;
	i = 0;
	while (node != NULL) {
		if ((node->type != TOPO_TYPE_GROUP &&
		    node->type != TOPO_TYPE_CACHE) ||
		    (root->type != TOPO_TYPE_SYSTEM &&
		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
			node = topo_next_node(root, node);
			continue;
		}
		cg_root->cg_child[i].cg_parent = cg_root;
		x86topo_add_sched_group(node, &cg_root->cg_child[i]);
		i++;
		node = topo_next_nonchild_node(root, node);
	}
}

/*
 * Build the MI scheduling topology from the discovered hardware topology.
 */
struct cpu_group *
cpu_topo(void)
{
	struct cpu_group *cg_root;

	if (mp_ncpus <= 1)
		return (smp_topo_none());

	cg_root = smp_topo_alloc(1);
	x86topo_add_sched_group(&topo_root, cg_root);
	return (cg_root);
}

static void
cpu_alloc(void *dummy __unused)
{
	/*
	 * Dynamically allocate the arrays that depend on the
	 * maximum APIC ID.
	 */
	cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS,
	    M_WAITOK | M_ZERO);
	apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS,
	    M_WAITOK | M_ZERO);
}
SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL);

/*
 * Add a logical CPU to the topology.
 */
void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > max_apic_id) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %u claims to be BSP, but CPU %u already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (bootverbose)
		printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	/*
	 * mp_ncpus and mp_maxid should be already set by calls to cpu_add().
	 * If there were no calls to cpu_add() assume this is a UP system.
	 */
	if (mp_ncpus == 0)
		mp_ncpus = 1;
}

int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	CPU_SETOF(0, &all_cpus);
	return (mp_ncpus > 1);
}

/* Allocate memory for the AP trampoline. */
void
alloc_ap_trampoline(vm_paddr_t *physmap, unsigned int *physmap_idx)
{
	unsigned int i;
	bool allocated;

	allocated = false;
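	/*
	 * Walk the physmap entries (base/end pairs) from the last pair down
	 * to the first.  The index is unsigned, so once it would drop below
	 * zero it wraps to a value larger than *physmap_idx and the loop
	 * terminates.
	 */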
	for (i = *physmap_idx; i <= *physmap_idx; i -= 2) {
		/*
		 * Find a memory region big enough and below the 1MB boundary
		 * for the trampoline code.
		 * NB: needs to be page aligned.
		 */
		if (physmap[i] >= MiB(1) ||
		    (trunc_page(physmap[i + 1]) - round_page(physmap[i])) <
		    round_page(bootMP_size))
			continue;

		allocated = true;
		/*
		 * Try to steal from the end of the region to mimic previous
		 * behaviour, else fallback to steal from the start.
		 */
		if (physmap[i + 1] < MiB(1)) {
			boot_address = trunc_page(physmap[i + 1]);
			if ((physmap[i + 1] - boot_address) < bootMP_size)
				boot_address -= round_page(bootMP_size);
			physmap[i + 1] = boot_address;
		} else {
			boot_address = round_page(physmap[i]);
			physmap[i] = boot_address + round_page(bootMP_size);
		}
		if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) {
			memmove(&physmap[i], &physmap[i + 2],
			    sizeof(*physmap) * (*physmap_idx - i + 2));
			*physmap_idx -= 2;
		}
		break;
	}

	if (!allocated) {
		boot_address = basemem * 1024 - bootMP_size;
		if (bootverbose)
			printf(
"Cannot find enough space for the boot trampoline, placing it at %#x",
			    boot_address);
	}
}

/*
 * AP CPUs call this to initialize themselves.
 */
void
init_secondary_tail(void)
{
	u_int cpuid;

	pmap_activate_boot(vmspace_pmap(proc0.p_vmspace));

	/*
	 * On real hardware, switch to x2apic mode if possible.  Do it
	 * after aps_ready was signalled, to avoid manipulating the
	 * mode while BSP might still want to send some IPI to us
	 * (second startup IPI is ignored on modern hardware etc).
	 */
	lapic_xapic_mode();

	/* Initialize the PAT MSR. */
	pmap_init_pat();

	/* set up CPU registers and state */
	cpu_setregs();

	/* set up SSE/NX */
	initializecpu();

	/* set up FPU state on the AP */
#ifdef __amd64__
	fpuinit();
#else
	npxinit(false);
#endif

	if (cpu_ops.cpu_init)
		cpu_ops.cpu_init();

	/* A quick check from sanity claus */
	cpuid = PCPU_GET(cpuid);
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", cpuid);
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mtx_lock_spin(&ap_boot_mtx);

	mca_init();

	/* Init local apic for irq's */
	lapic_setup(1);

	/* Set memory range attributes for this CPU to match the BSP */
	mem_range_AP_init();

	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
	if (bootverbose)
		printf("SMP: AP CPU #%d Launched!\n", cpuid);
	else
		printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "",
		    cpuid, smp_cpus == mp_ncpus ? "\n" : " ");

	/* Determine if we are a logical CPU. */
	if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
		CPU_SET(cpuid, &logical_cpus_mask);

	if (bootverbose)
		lapic_dump("AP");

	if (smp_cpus == mp_ncpus) {
		/* enable IPI's, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
	}

#ifdef __amd64__
	/*
	 * Enable global pages TLB extension
	 * This also implicitly flushes the TLB
	 */
	load_cr4(rcr4() | CR4_PGE);
	if (pmap_pcid_enabled)
		load_cr4(rcr4() | CR4_PCIDE);
	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);
#endif

	mtx_unlock_spin(&ap_boot_mtx);

	/* Wait until all the AP's are up. */
	while (atomic_load_acq_int(&smp_started) == 0)
		ia32_pause();

#ifndef EARLY_AP_STARTUP
	/* Start per-CPU event timers. */
	cpu_initclocks_ap();
#endif

	kcsan_cpu_init(cpuid);

	/*
	 * Assert that smp_after_idle_runnable condition is reasonable.
	 */
	MPASS(PCPU_GET(curpcb) == NULL);

	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}

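/*
 * Wait for each AP to switch to its idle thread (observed as pc_curpcb
 * becoming non-NULL), then free the bootstrap stack that the AP no longer
 * uses.
 */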
static void
smp_after_idle_runnable(void *arg __unused)
{
	struct pcpu *pc;
	int cpu;

	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		pc = pcpu_find(cpu);
		while (atomic_load_ptr(&pc->pc_curpcb) == NULL)
			cpu_spinwait();
		kmem_free((vm_offset_t)bootstacks[cpu], kstack_pages *
		    PAGE_SIZE);
	}
}
SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY,
    smp_after_idle_runnable, NULL);

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (cpu_info[apic_id].cpu_hyperthread &&
		    !hyperthreading_intr_allowed)
			continue;

		intr_add_cpu(i);
	}
}

#ifdef COUNT_XINVLTLB_HITS
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
    sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
    sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
    sizeof(xhits_rng), "IU", "");

u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
    0, "");
#endif /* COUNT_XINVLTLB_HITS */

/*
 * Init and startup IPI.
 */
void
ipi_startup(int apic_id, int vector)
{

	/*
	 * This attempts to follow the algorithm described in the
	 * Intel Multiprocessor Specification v1.4 in section B.4.
	 * For each IPI, we allow the local APIC ~20us to deliver the
	 * IPI.  If that times out, we panic.
	 */

	/*
	 * first we do an INIT IPI: this INIT IPI might be run, resetting
	 * and running the target CPU.  OR this INIT IPI might be latched (P5
	 * bug), CPU waiting for STARTUP IPI.  OR this INIT IPI might be
	 * ignored.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
	lapic_ipi_wait(100);

	/* Explicitly deassert the INIT IPI. */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
	    apic_id);

	DELAY(10000);		/* wait ~10mS */

	/*
	 * next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched, (P5 bug) this 1st STARTUP would then terminate
	 * immediately, and the previously started INIT IPI would continue. OR
	 * the previous INIT IPI has already run. and this STARTUP IPI will
	 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
	 * will run.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver first STARTUP IPI to APIC %d",
		    apic_id);
	DELAY(200);		/* wait ~200uS */

	/*
	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
	 * recognized after hardware RESET or INIT IPI.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver second STARTUP IPI to APIC %d",
		    apic_id);

	DELAY(200);		/* wait ~200uS */
}

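/*
 * Mark a bitmapped IPI as pending in the target CPU's per-CPU IPI bitmap.
 * Returns true if some bitmapped IPI was already pending there, in which case
 * an IPI_BITMAP_VECTOR interrupt is already on its way and the caller need
 * not send another one.
 */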
static bool
ipi_bitmap_set(int cpu, u_int ipi)
{
	u_int bitmap, old, new;
	u_int *cpu_bitmap;

	bitmap = 1 << ipi;
	cpu_bitmap = &cpuid_to_pcpu[cpu]->pc_ipi_bitmap;
	old = *cpu_bitmap;
	for (;;) {
		if ((old & bitmap) != 0)
			break;
		new = old | bitmap;
		if (atomic_fcmpset_int(cpu_bitmap, &old, new))
			break;
	}
	return (old != 0);
}

/*
 * Send an IPI to specified CPU handling the bitmap logic.
 */
static void
ipi_send_cpu(int cpu, u_int ipi)
{

	KASSERT((u_int)cpu < MAXCPU && cpu_apic_ids[cpu] != -1,
	    ("IPI to non-existent CPU %d", cpu));

	if (IPI_IS_BITMAPED(ipi)) {
		if (ipi_bitmap_set(cpu, ipi))
			return;
		ipi = IPI_BITMAP_VECTOR;
	}
	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
}

void
ipi_bitmap_handler(struct trapframe frame)
{
	struct trapframe *oldframe;
	struct thread *td;
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	td = curthread;
	ipi_bitmap = atomic_readandclear_int(&cpuid_to_pcpu[cpu]->
	    pc_ipi_bitmap);

	/*
	 * sched_preempt() must be called to clear the pending preempt
	 * IPI to enable delivery of further preempts.  However, the
	 * critical section will cause extra scheduler lock thrashing
	 * when used unconditionally.  Only critical_enter() if
	 * hardclock must also run, which requires the section entry.
	 */
	if (ipi_bitmap & (1 << IPI_HARDCLOCK))
		critical_enter();

	td->td_intr_nesting_level++;
	oldframe = td->td_intr_frame;
	td->td_intr_frame = &frame;
#if defined(STACK) || defined(DDB)
	if (ipi_bitmap & (1 << IPI_TRACE))
		stack_capture_intr();
#endif
	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
#endif
		sched_preempt(td);
	}
	if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
#endif
		/* Nothing to do for AST */
	}
	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
#ifdef COUNT_IPIS
		(*ipi_hardclock_counts[cpu])++;
#endif
		hardclockintr();
	}
	td->td_intr_frame = oldframe;
	td->td_intr_nesting_level--;
	if (ipi_bitmap & (1 << IPI_HARDCLOCK))
		critical_exit();
}

/*
 * send an IPI to a set of cpus.
 */
void
ipi_selected(cpuset_t cpus, u_int ipi)
{
	int cpu;

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);

	while ((cpu = CPU_FFS(&cpus)) != 0) {
		cpu--;
		CPU_CLR(cpu, &cpus);
		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
		ipi_send_cpu(cpu, ipi);
	}
}

/*
 * send an IPI to a specific CPU.
 */
void
ipi_cpu(int cpu, u_int ipi)
{

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);

	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
	ipi_send_cpu(cpu, ipi);
}

/*
 * send an IPI to all CPUs EXCEPT myself
 */
void
ipi_all_but_self(u_int ipi)
{
	cpuset_t other_cpus;
	int cpu, c;

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD) {
		other_cpus = all_cpus;
		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);
	}

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	if (IPI_IS_BITMAPED(ipi)) {
		cpu = PCPU_GET(cpuid);
		CPU_FOREACH(c) {
			if (c != cpu)
				ipi_bitmap_set(c, ipi);
		}
		ipi = IPI_BITMAP_VECTOR;
	}
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}

void
ipi_self_from_nmi(u_int vector)
{

	lapic_ipi_vectored(vector, APIC_IPI_DEST_SELF);

	/* Wait for IPI to finish. */
	if (!lapic_ipi_wait(50000)) {
		if (KERNEL_PANICKED())
			return;
		else
			panic("APIC: IPI is stuck");
	}
}

int
ipi_nmi_handler(void)
{
	u_int cpuid;

	/*
	 * As long as there is not a simple way to know about a NMI's
	 * source, if the bitmask for the current CPU is present in
	 * the global pending bitword an IPI_STOP_HARD has been issued
	 * and should be handled.
	 */
	cpuid = PCPU_GET(cpuid);
	if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
		return (1);

	CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
	cpustop_handler();
	return (0);
}

int nmi_kdb_lock;

void
nmi_call_kdb_smp(u_int type, struct trapframe *frame)
{
	int cpu;
	bool call_post;

	cpu = PCPU_GET(cpuid);
	if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
		nmi_call_kdb(cpu, type, frame);
		call_post = false;
	} else {
		savectx(&stoppcbs[cpu]);
		CPU_SET_ATOMIC(cpu, &stopped_cpus);
		while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
			ia32_pause();
		call_post = true;
	}
	atomic_store_rel_int(&nmi_kdb_lock, 0);
	if (call_post)
		cpustop_handler_post(cpu);
}

/*
 * Handle an IPI_STOP by saving our current context and spinning (or mwaiting,
 * if available) until we are resumed.
 */
void
cpustop_handler(void)
{
	struct monitorbuf *mb;
	u_int cpu;
	bool use_mwait;

	cpu = PCPU_GET(cpuid);

	savectx(&stoppcbs[cpu]);

	use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 &&
	    !mwait_cpustop_broken);
	if (use_mwait) {
		mb = PCPU_PTR(monitorbuf);
		atomic_store_int(&mb->stop_state,
		    MONITOR_STOPSTATE_STOPPED);
	}

	/* Indicate that we are stopped */
	CPU_SET_ATOMIC(cpu, &stopped_cpus);

	/* Wait for restart */
	while (!CPU_ISSET(cpu, &started_cpus)) {
		if (use_mwait) {
			cpu_monitor(mb, 0, 0);
			if (atomic_load_int(&mb->stop_state) ==
			    MONITOR_STOPSTATE_STOPPED)
				cpu_mwait(0, MWAIT_C1);
			continue;
		}

		ia32_pause();

		/*
		 * Halt non-BSP CPUs on panic -- we're never going to need them
		 * again, and might as well save power / release resources
		 * (e.g., overprovisioned VM infrastructure).
		 */
		while (__predict_false(!IS_BSP() && KERNEL_PANICKED()))
			halt();
	}

	cpustop_handler_post(cpu);
}

static void
cpustop_handler_post(u_int cpu)
{

	CPU_CLR_ATOMIC(cpu, &started_cpus);
	CPU_CLR_ATOMIC(cpu, &stopped_cpus);

	/*
	 * We don't broadcast TLB invalidations to other CPUs when they are
	 * stopped.  Hence, we clear the TLB before resuming.
	 */
	invltlb_glob();

#if defined(__amd64__) && defined(DDB)
	amd64_db_resume_dbreg();
#endif

	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * Handle an IPI_SUSPEND by saving our current context and spinning until we
 * are resumed.
 */
void
cpususpend_handler(void)
{
	u_int cpu;

	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);

	cpu = PCPU_GET(cpuid);
	if (savectx(&susppcbs[cpu]->sp_pcb)) {
#ifdef __amd64__
		fpususpend(susppcbs[cpu]->sp_fpususpend);
#else
		npxsuspend(susppcbs[cpu]->sp_fpususpend);
#endif
		/*
		 * suspended_cpus is cleared shortly after each AP is restarted
		 * by a Startup IPI, so that the BSP can proceed to restarting
		 * the next AP.
		 *
		 * resuming_cpus gets cleared when the AP completes
		 * initialization after having been released by the BSP.
		 * resuming_cpus is probably not the best name for the
		 * variable, because it is actually a set of processors that
		 * haven't resumed yet and haven't necessarily started resuming.
		 *
		 * Note that suspended_cpus is meaningful only for ACPI suspend
		 * as it's not really used for Xen suspend since the APs are
		 * automatically restored to the running state and the correct
		 * context.  For the same reason resumectx is never called in
		 * that case.
		 */
		CPU_SET_ATOMIC(cpu, &suspended_cpus);
		CPU_SET_ATOMIC(cpu, &resuming_cpus);

		/*
		 * Invalidate the cache after setting the global status bits.
		 * The last AP to set its bit may end up being an Owner of the
		 * corresponding cache line in MOESI protocol.  The AP may be
		 * stopped before the cache line is written to the main memory.
		 */
		wbinvd();
	} else {
#ifdef __amd64__
		fpuresume(susppcbs[cpu]->sp_fpususpend);
#else
		npxresume(susppcbs[cpu]->sp_fpususpend);
#endif
		pmap_init_pat();
		initializecpu();
		PCPU_SET(switchtime, 0);
		PCPU_SET(switchticks, ticks);

		/* Indicate that we have restarted and restored the context. */
		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	}

	/* Wait for resume directive */
	while (!CPU_ISSET(cpu, &toresume_cpus))
		ia32_pause();

	/* Re-apply microcode updates. */
	ucode_reload();

#ifdef __i386__
	/* Finish removing the identity mapping of low memory for this AP. */
	invltlb_glob();
#endif

	if (cpu_ops.cpu_resume)
		cpu_ops.cpu_resume();
#ifdef __amd64__
	if (vmm_resume_p)
		vmm_resume_p();
#endif

	/* Resume MCA and local APIC */
	lapic_xapic_mode();
	mca_resume();
	lapic_setup(0);

	/* Indicate that we are resumed */
	CPU_CLR_ATOMIC(cpu, &resuming_cpus);
	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	CPU_CLR_ATOMIC(cpu, &toresume_cpus);
}

/*
 * Handle an IPI_SWI by waking delayed SWI thread.
 */
void
ipi_swi_handler(struct trapframe frame)
{

	intr_event_handle(clk_intr_event, &frame);
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the AP's out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

#ifdef COUNT_IPIS
/*
 * Setup interrupt counters for IPI handlers.
 */
static void
mp_ipi_intrcnt(void *dummy)
{
	char buf[64];
	int i;

	CPU_FOREACH(i) {
		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
		intrcnt_add(buf, &ipi_invltlb_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
		intrcnt_add(buf, &ipi_invlrng_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
		intrcnt_add(buf, &ipi_invlpg_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
		intrcnt_add(buf, &ipi_invlcache_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
		intrcnt_add(buf, &ipi_preempt_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
		intrcnt_add(buf, &ipi_ast_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
		intrcnt_add(buf, &ipi_hardclock_counts[i]);
	}
}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
#endif