1 /*- 2 * Copyright (c) 1996, by Steve Passe 3 * Copyright (c) 2003, by Peter Wemm 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. The name of the developer may NOT be used to endorse or promote products 12 * derived from this software without specific prior written permission. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #ifdef __i386__ 31 #include "opt_apic.h" 32 #endif 33 #include "opt_cpu.h" 34 #include "opt_kstack_pages.h" 35 #include "opt_pmap.h" 36 #include "opt_sched.h" 37 #include "opt_smp.h" 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/bus.h> 42 #include <sys/cons.h> /* cngetc() */ 43 #include <sys/cpuset.h> 44 #ifdef GPROF 45 #include <sys/gmon.h> 46 #endif 47 #include <sys/kdb.h> 48 #include <sys/kernel.h> 49 #include <sys/ktr.h> 50 #include <sys/lock.h> 51 #include <sys/malloc.h> 52 #include <sys/memrange.h> 53 #include <sys/mutex.h> 54 #include <sys/pcpu.h> 55 #include <sys/proc.h> 56 #include <sys/sched.h> 57 #include <sys/smp.h> 58 #include <sys/sysctl.h> 59 60 #include <vm/vm.h> 61 #include <vm/vm_param.h> 62 #include <vm/pmap.h> 63 #include <vm/vm_kern.h> 64 #include <vm/vm_extern.h> 65 #include <vm/vm_map.h> 66 67 #include <x86/apicreg.h> 68 #include <machine/clock.h> 69 #include <machine/cpu.h> 70 #include <machine/cputypes.h> 71 #include <x86/mca.h> 72 #include <machine/md_var.h> 73 #include <machine/pcb.h> 74 #include <machine/psl.h> 75 #include <machine/smp.h> 76 #include <machine/specialreg.h> 77 #include <x86/ucode.h> 78 79 static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items"); 80 81 /* lock region used by kernel profiling */ 82 int mcount_lock; 83 84 int mp_naps; /* # of Applications processors */ 85 int boot_cpu_id = -1; /* designated BSP */ 86 87 /* AP uses this during bootstrap. Do not staticize. */ 88 char *bootSTK; 89 int bootAP; 90 91 /* Free these after use */ 92 void *bootstacks[MAXCPU]; 93 void *dpcpu; 94 95 struct pcb stoppcbs[MAXCPU]; 96 struct susppcb **susppcbs; 97 98 #ifdef COUNT_IPIS 99 /* Interrupt counts. */ 100 static u_long *ipi_preempt_counts[MAXCPU]; 101 static u_long *ipi_ast_counts[MAXCPU]; 102 u_long *ipi_invltlb_counts[MAXCPU]; 103 u_long *ipi_invlrng_counts[MAXCPU]; 104 u_long *ipi_invlpg_counts[MAXCPU]; 105 u_long *ipi_invlcache_counts[MAXCPU]; 106 u_long *ipi_rendezvous_counts[MAXCPU]; 107 static u_long *ipi_hardclock_counts[MAXCPU]; 108 #endif 109 110 /* Default cpu_ops implementation. 
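 * The cpu_init and cpu_resume hooks are optional; they are called only
 * when non-NULL (see init_secondary_tail() and cpususpend_handler()).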
*/ 111 struct cpu_ops cpu_ops; 112 113 /* 114 * Local data and functions. 115 */ 116 117 static volatile cpuset_t ipi_stop_nmi_pending; 118 119 volatile cpuset_t resuming_cpus; 120 volatile cpuset_t toresume_cpus; 121 122 /* used to hold the APs until we are ready to release them */ 123 struct mtx ap_boot_mtx; 124 125 /* Set to 1 once we're ready to let the APs out of the pen. */ 126 volatile int aps_ready = 0; 127 128 /* 129 * Store data from cpu_add() until later in the boot when we actually setup 130 * the APs. 131 */ 132 struct cpu_info *cpu_info; 133 int *apic_cpuids; 134 int cpu_apic_ids[MAXCPU]; 135 _Static_assert(MAXCPU <= MAX_APIC_ID, 136 "MAXCPU cannot be larger than MAX_APIC_ID"); 137 _Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID, 138 "xAPIC_MAX_APIC_ID cannot be larger than MAX_APIC_ID"); 139 140 /* Holds pending bitmap based IPIs per CPU */ 141 volatile u_int cpu_ipi_pending[MAXCPU]; 142 143 static void release_aps(void *dummy); 144 static void cpustop_handler_post(u_int cpu); 145 146 static int hyperthreading_allowed = 1; 147 SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN, 148 &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs"); 149 150 static struct topo_node topo_root; 151 152 static int pkg_id_shift; 153 static int node_id_shift; 154 static int core_id_shift; 155 static int disabled_cpus; 156 157 struct cache_info { 158 int id_shift; 159 int present; 160 } static caches[MAX_CACHE_LEVELS]; 161 162 unsigned int boot_address; 163 164 #define MiB(v) (v ## ULL << 20) 165 166 void 167 mem_range_AP_init(void) 168 { 169 170 if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) 171 mem_range_softc.mr_op->initAP(&mem_range_softc); 172 } 173 174 /* 175 * Round up to the next power of two, if necessary, and then 176 * take log2. 177 * Returns -1 if argument is zero. 178 */ 179 static __inline int 180 mask_width(u_int x) 181 { 182 183 return (fls(x << (1 - powerof2(x))) - 1); 184 } 185 186 /* 187 * Add a cache level to the cache topology description. 188 */ 189 static int 190 add_deterministic_cache(int type, int level, int share_count) 191 { 192 193 if (type == 0) 194 return (0); 195 if (type > 3) { 196 printf("unexpected cache type %d\n", type); 197 return (1); 198 } 199 if (type == 2) /* ignore instruction cache */ 200 return (1); 201 if (level == 0 || level > MAX_CACHE_LEVELS) { 202 printf("unexpected cache level %d\n", level); 203 return (1); 204 } 205 206 if (caches[level - 1].present) { 207 printf("WARNING: multiple entries for L%u data cache\n", level); 208 printf("%u => %u\n", caches[level - 1].id_shift, 209 mask_width(share_count)); 210 } 211 caches[level - 1].id_shift = mask_width(share_count); 212 caches[level - 1].present = 1; 213 214 if (caches[level - 1].id_shift > pkg_id_shift) { 215 printf("WARNING: L%u data cache covers more " 216 "APIC IDs than a package (%u > %u)\n", level, 217 caches[level - 1].id_shift, pkg_id_shift); 218 caches[level - 1].id_shift = pkg_id_shift; 219 } 220 if (caches[level - 1].id_shift < core_id_shift) { 221 printf("WARNING: L%u data cache covers fewer " 222 "APIC IDs than a core (%u < %u)\n", level, 223 caches[level - 1].id_shift, core_id_shift); 224 caches[level - 1].id_shift = core_id_shift; 225 } 226 227 return (1); 228 } 229 230 /* 231 * Determine topology of processing units and caches for AMD CPUs.
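 * The package ID shift is derived from AMDID_COREID_SIZE (or from
 * AMDID_CMP_CORES on family 0Fh); the core and node ID shifts come from
 * leaf 0x8000001e on families with topology extensions.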
232 * See: 233 * - AMD CPUID Specification (Publication # 25481) 234 * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559) 235 * - BKDG For AMD Family 10h Processors (Publication # 31116) 236 * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301) 237 * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751) 238 * - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945) 239 */ 240 static void 241 topo_probe_amd(void) 242 { 243 u_int p[4]; 244 uint64_t v; 245 int level; 246 int nodes_per_socket; 247 int share_count; 248 int type; 249 int i; 250 251 /* No multi-core capability. */ 252 if ((amd_feature2 & AMDID2_CMP) == 0) 253 return; 254 255 /* For families 10h and newer. */ 256 pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >> 257 AMDID_COREID_SIZE_SHIFT; 258 259 /* For 0Fh family. */ 260 if (pkg_id_shift == 0) 261 pkg_id_shift = 262 mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1); 263 264 /* 265 * Families prior to 16h define the following value as 266 * cores per compute unit and we don't really care about the AMD 267 * compute units at the moment. Perhaps we should treat them as 268 * cores and cores within the compute units as hardware threads, 269 * but that's up for debate. 270 * Later families define the value as threads per compute unit, 271 * so we are following AMD's nomenclature here. 272 */ 273 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 && 274 CPUID_TO_FAMILY(cpu_id) >= 0x16) { 275 cpuid_count(0x8000001e, 0, p); 276 share_count = ((p[1] >> 8) & 0xff) + 1; 277 core_id_shift = mask_width(share_count); 278 279 /* 280 * For Zen (17h), gather Nodes per Processor. Each node is a 281 * Zeppelin die; TR and EPYC CPUs will have multiple dies per 282 * package. Communication latency between dies is higher than 283 * within them. 284 */ 285 nodes_per_socket = ((p[2] >> 8) & 0x7) + 1; 286 node_id_shift = pkg_id_shift - mask_width(nodes_per_socket); 287 } 288 289 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) { 290 for (i = 0; ; i++) { 291 cpuid_count(0x8000001d, i, p); 292 type = p[0] & 0x1f; 293 level = (p[0] >> 5) & 0x7; 294 share_count = 1 + ((p[0] >> 14) & 0xfff); 295 296 if (!add_deterministic_cache(type, level, share_count)) 297 break; 298 } 299 } else { 300 if (cpu_exthigh >= 0x80000005) { 301 cpuid_count(0x80000005, 0, p); 302 if (((p[2] >> 24) & 0xff) != 0) { 303 caches[0].id_shift = 0; 304 caches[0].present = 1; 305 } 306 } 307 if (cpu_exthigh >= 0x80000006) { 308 cpuid_count(0x80000006, 0, p); 309 if (((p[2] >> 16) & 0xffff) != 0) { 310 caches[1].id_shift = 0; 311 caches[1].present = 1; 312 } 313 if (((p[3] >> 18) & 0x3fff) != 0) { 314 nodes_per_socket = 1; 315 if ((amd_feature2 & AMDID2_NODE_ID) != 0) { 316 /* 317 * Handle multi-node processors that 318 * have multiple chips, each with its 319 * own L3 cache, on the same die. 320 */ 321 v = rdmsr(0xc001100c); 322 nodes_per_socket = 1 + ((v >> 3) & 0x7); 323 } 324 caches[2].id_shift = 325 pkg_id_shift - mask_width(nodes_per_socket); 326 caches[2].present = 1; 327 } 328 } 329 } 330 } 331 332 /* 333 * Determine topology of processing units for Intel CPUs 334 * using CPUID Leaf 1 and Leaf 4, if supported. 
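 * This is also the fallback used by topo_probe_intel_0xb() when
 * CPUID leaf 0xB turns out not to be implemented.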
335 * See: 336 * - Intel 64 Architecture Processor Topology Enumeration 337 * - Intel 64 and IA-32 Architectures Software Developer’s Manual, 338 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS 339 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS 340 */ 341 static void 342 topo_probe_intel_0x4(void) 343 { 344 u_int p[4]; 345 int max_cores; 346 int max_logical; 347 348 /* Both zero and one here mean one logical processor per package. */ 349 max_logical = (cpu_feature & CPUID_HTT) != 0 ? 350 (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; 351 if (max_logical <= 1) 352 return; 353 354 if (cpu_high >= 0x4) { 355 cpuid_count(0x04, 0, p); 356 max_cores = ((p[0] >> 26) & 0x3f) + 1; 357 } else 358 max_cores = 1; 359 360 core_id_shift = mask_width(max_logical/max_cores); 361 KASSERT(core_id_shift >= 0, 362 ("intel topo: max_cores > max_logical\n")); 363 pkg_id_shift = core_id_shift + mask_width(max_cores); 364 } 365 366 /* 367 * Determine topology of processing units for Intel CPUs 368 * using CPUID Leaf 11, if supported. 369 * See: 370 * - Intel 64 Architecture Processor Topology Enumeration 371 * - Intel 64 and IA-32 Architectures Software Developer’s Manual, 372 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS 373 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS 374 */ 375 static void 376 topo_probe_intel_0xb(void) 377 { 378 u_int p[4]; 379 int bits; 380 int type; 381 int i; 382 383 /* Fall back if CPU leaf 11 doesn't really exist. */ 384 cpuid_count(0x0b, 0, p); 385 if (p[1] == 0) { 386 topo_probe_intel_0x4(); 387 return; 388 } 389 390 /* We only support three levels for now. */ 391 for (i = 0; ; i++) { 392 cpuid_count(0x0b, i, p); 393 394 bits = p[0] & 0x1f; 395 type = (p[2] >> 8) & 0xff; 396 397 if (type == 0) 398 break; 399 400 /* TODO: check for duplicate (re-)assignment */ 401 if (type == CPUID_TYPE_SMT) 402 core_id_shift = bits; 403 else if (type == CPUID_TYPE_CORE) 404 pkg_id_shift = bits; 405 else 406 printf("unknown CPU level type %d\n", type); 407 } 408 409 if (pkg_id_shift < core_id_shift) { 410 printf("WARNING: core covers more APIC IDs than a package\n"); 411 core_id_shift = pkg_id_shift; 412 } 413 } 414 415 /* 416 * Determine topology of caches for Intel CPUs. 417 * See: 418 * - Intel 64 Architecture Processor Topology Enumeration 419 * - Intel 64 and IA-32 Architectures Software Developer’s Manual 420 * Volume 2A: Instruction Set Reference, A-M, 421 * CPUID instruction 422 */ 423 static void 424 topo_probe_intel_caches(void) 425 { 426 u_int p[4]; 427 int level; 428 int share_count; 429 int type; 430 int i; 431 432 if (cpu_high < 0x4) { 433 /* 434 * Available cache levels and sizes can be determined 435 * via CPUID leaf 2, but that requires a huge table of hardcoded 436 * values, so for now just assume L1 and L2 caches potentially 437 * shared only by HTT processing units, if HTT is present. 438 */ 439 caches[0].id_shift = pkg_id_shift; 440 caches[0].present = 1; 441 caches[1].id_shift = pkg_id_shift; 442 caches[1].present = 1; 443 return; 444 } 445 446 for (i = 0; ; i++) { 447 cpuid_count(0x4, i, p); 448 type = p[0] & 0x1f; 449 level = (p[0] >> 5) & 0x7; 450 share_count = 1 + ((p[0] >> 14) & 0xfff); 451 452 if (!add_deterministic_cache(type, level, share_count)) 453 break; 454 } 455 } 456 457 /* 458 * Determine topology of processing units and caches for Intel CPUs.
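 * CPUID leaf 0xB is used when available; otherwise the leaf 1/leaf 4
 * method above is used. Cache topology is probed separately.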
459 * See: 460 * - Intel 64 Architecture Processor Topology Enumeration 461 */ 462 static void 463 topo_probe_intel(void) 464 { 465 466 /* 467 * Note that the 0x1 <= cpu_high < 4 case should be 468 * compatible with topo_probe_intel_0x4() logic when 469 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) 470 * or it should trigger the fallback otherwise. 471 */ 472 if (cpu_high >= 0xb) 473 topo_probe_intel_0xb(); 474 else if (cpu_high >= 0x1) 475 topo_probe_intel_0x4(); 476 477 topo_probe_intel_caches(); 478 } 479 480 /* 481 * Topology information is queried only on the BSP, on which this 482 * code runs and for which it can query CPUID information. 483 * Then the topology is extrapolated to all packages using the 484 * assumption that the APIC ID to hardware component ID mapping is 485 * homogeneous. 486 * That doesn't necessarily imply that the topology is uniform. 487 */ 488 void 489 topo_probe(void) 490 { 491 static int cpu_topo_probed = 0; 492 struct x86_topo_layer { 493 int type; 494 int subtype; 495 int id_shift; 496 } topo_layers[MAX_CACHE_LEVELS + 4]; 497 struct topo_node *parent; 498 struct topo_node *node; 499 int layer; 500 int nlayers; 501 int node_id; 502 int i; 503 504 if (cpu_topo_probed) 505 return; 506 507 CPU_ZERO(&logical_cpus_mask); 508 509 if (mp_ncpus <= 1) 510 ; /* nothing */ 511 else if (cpu_vendor_id == CPU_VENDOR_AMD) 512 topo_probe_amd(); 513 else if (cpu_vendor_id == CPU_VENDOR_INTEL) 514 topo_probe_intel(); 515 516 KASSERT(pkg_id_shift >= core_id_shift, 517 ("bug in APIC topology discovery")); 518 519 nlayers = 0; 520 bzero(topo_layers, sizeof(topo_layers)); 521 522 topo_layers[nlayers].type = TOPO_TYPE_PKG; 523 topo_layers[nlayers].id_shift = pkg_id_shift; 524 if (bootverbose) 525 printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift); 526 nlayers++; 527 528 if (pkg_id_shift > node_id_shift && node_id_shift != 0) { 529 topo_layers[nlayers].type = TOPO_TYPE_GROUP; 530 topo_layers[nlayers].id_shift = node_id_shift; 531 if (bootverbose) 532 printf("Node ID shift: %u\n", 533 topo_layers[nlayers].id_shift); 534 nlayers++; 535 } 536 537 /* 538 * Consider all caches to be within a package/chip 539 * and "in front" of all sub-components like 540 * cores and hardware threads.
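 * The topo_layers array is filled from the outermost component inwards,
 * so cache levels are added here in decreasing order, after the
 * package/group layers and before the core and PU layers.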
541 */ 542 for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) { 543 if (caches[i].present) { 544 if (node_id_shift != 0) 545 KASSERT(caches[i].id_shift <= node_id_shift, 546 ("bug in APIC topology discovery")); 547 KASSERT(caches[i].id_shift <= pkg_id_shift, 548 ("bug in APIC topology discovery")); 549 KASSERT(caches[i].id_shift >= core_id_shift, 550 ("bug in APIC topology discovery")); 551 552 topo_layers[nlayers].type = TOPO_TYPE_CACHE; 553 topo_layers[nlayers].subtype = i + 1; 554 topo_layers[nlayers].id_shift = caches[i].id_shift; 555 if (bootverbose) 556 printf("L%u cache ID shift: %u\n", 557 topo_layers[nlayers].subtype, 558 topo_layers[nlayers].id_shift); 559 nlayers++; 560 } 561 } 562 563 if (pkg_id_shift > core_id_shift) { 564 topo_layers[nlayers].type = TOPO_TYPE_CORE; 565 topo_layers[nlayers].id_shift = core_id_shift; 566 if (bootverbose) 567 printf("Core ID shift: %u\n", 568 topo_layers[nlayers].id_shift); 569 nlayers++; 570 } 571 572 topo_layers[nlayers].type = TOPO_TYPE_PU; 573 topo_layers[nlayers].id_shift = 0; 574 nlayers++; 575 576 topo_init_root(&topo_root); 577 for (i = 0; i <= max_apic_id; ++i) { 578 if (!cpu_info[i].cpu_present) 579 continue; 580 581 parent = &topo_root; 582 for (layer = 0; layer < nlayers; ++layer) { 583 node_id = i >> topo_layers[layer].id_shift; 584 parent = topo_add_node_by_hwid(parent, node_id, 585 topo_layers[layer].type, 586 topo_layers[layer].subtype); 587 } 588 } 589 590 parent = &topo_root; 591 for (layer = 0; layer < nlayers; ++layer) { 592 node_id = boot_cpu_id >> topo_layers[layer].id_shift; 593 node = topo_find_node_by_hwid(parent, node_id, 594 topo_layers[layer].type, 595 topo_layers[layer].subtype); 596 topo_promote_child(node); 597 parent = node; 598 } 599 600 cpu_topo_probed = 1; 601 } 602 603 /* 604 * Assign logical CPU IDs to local APICs. 605 */ 606 void 607 assign_cpu_ids(void) 608 { 609 struct topo_node *node; 610 u_int smt_mask; 611 int nhyper; 612 613 smt_mask = (1u << core_id_shift) - 1; 614 615 /* 616 * Assign CPU IDs to local APIC IDs and disable any CPUs 617 * beyond MAXCPU. CPU 0 is always assigned to the BSP. 618 */ 619 mp_ncpus = 0; 620 nhyper = 0; 621 TOPO_FOREACH(node, &topo_root) { 622 if (node->type != TOPO_TYPE_PU) 623 continue; 624 625 if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask)) 626 cpu_info[node->hwid].cpu_hyperthread = 1; 627 628 if (resource_disabled("lapic", node->hwid)) { 629 if (node->hwid != boot_cpu_id) 630 cpu_info[node->hwid].cpu_disabled = 1; 631 else 632 printf("Cannot disable BSP, APIC ID = %d\n", 633 node->hwid); 634 } 635 636 if (!hyperthreading_allowed && 637 cpu_info[node->hwid].cpu_hyperthread) 638 cpu_info[node->hwid].cpu_disabled = 1; 639 640 if (mp_ncpus >= MAXCPU) 641 cpu_info[node->hwid].cpu_disabled = 1; 642 643 if (cpu_info[node->hwid].cpu_disabled) { 644 disabled_cpus++; 645 continue; 646 } 647 648 if (cpu_info[node->hwid].cpu_hyperthread) 649 nhyper++; 650 651 cpu_apic_ids[mp_ncpus] = node->hwid; 652 apic_cpuids[node->hwid] = mp_ncpus; 653 topo_set_pu_id(node, mp_ncpus); 654 mp_ncpus++; 655 } 656 657 KASSERT(mp_maxid >= mp_ncpus - 1, 658 ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, 659 mp_ncpus)); 660 661 mp_ncores = mp_ncpus - nhyper; 662 smp_threads_per_core = mp_ncpus / mp_ncores; 663 } 664 665 /* 666 * Print various information about the SMP system hardware and setup. 
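 * A second "Online" summary is printed when some CPUs were disabled,
 * and per-CPU details are printed when booting verbose.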
667 */ 668 void 669 cpu_mp_announce(void) 670 { 671 struct topo_node *node; 672 const char *hyperthread; 673 struct topo_analysis topology; 674 675 printf("FreeBSD/SMP: "); 676 if (topo_analyze(&topo_root, 1, &topology)) { 677 printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]); 678 if (topology.entities[TOPO_LEVEL_GROUP] > 1) 679 printf(" x %d groups", 680 topology.entities[TOPO_LEVEL_GROUP]); 681 if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) 682 printf(" x %d cache groups", 683 topology.entities[TOPO_LEVEL_CACHEGROUP]); 684 if (topology.entities[TOPO_LEVEL_CORE] > 0) 685 printf(" x %d core(s)", 686 topology.entities[TOPO_LEVEL_CORE]); 687 if (topology.entities[TOPO_LEVEL_THREAD] > 1) 688 printf(" x %d hardware threads", 689 topology.entities[TOPO_LEVEL_THREAD]); 690 } else { 691 printf("Non-uniform topology"); 692 } 693 printf("\n"); 694 695 if (disabled_cpus) { 696 printf("FreeBSD/SMP Online: "); 697 if (topo_analyze(&topo_root, 0, &topology)) { 698 printf("%d package(s)", 699 topology.entities[TOPO_LEVEL_PKG]); 700 if (topology.entities[TOPO_LEVEL_GROUP] > 1) 701 printf(" x %d groups", 702 topology.entities[TOPO_LEVEL_GROUP]); 703 if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) 704 printf(" x %d cache groups", 705 topology.entities[TOPO_LEVEL_CACHEGROUP]); 706 if (topology.entities[TOPO_LEVEL_CORE] > 0) 707 printf(" x %d core(s)", 708 topology.entities[TOPO_LEVEL_CORE]); 709 if (topology.entities[TOPO_LEVEL_THREAD] > 1) 710 printf(" x %d hardware threads", 711 topology.entities[TOPO_LEVEL_THREAD]); 712 } else { 713 printf("Non-uniform topology"); 714 } 715 printf("\n"); 716 } 717 718 if (!bootverbose) 719 return; 720 721 TOPO_FOREACH(node, &topo_root) { 722 switch (node->type) { 723 case TOPO_TYPE_PKG: 724 printf("Package HW ID = %u\n", node->hwid); 725 break; 726 case TOPO_TYPE_CORE: 727 printf("\tCore HW ID = %u\n", node->hwid); 728 break; 729 case TOPO_TYPE_PU: 730 if (cpu_info[node->hwid].cpu_hyperthread) 731 hyperthread = "/HT"; 732 else 733 hyperthread = ""; 734 735 if (node->subtype == 0) 736 printf("\t\tCPU (AP%s): APIC ID: %u" 737 "(disabled)\n", hyperthread, node->hwid); 738 else if (node->id == 0) 739 printf("\t\tCPU0 (BSP): APIC ID: %u\n", 740 node->hwid); 741 else 742 printf("\t\tCPU%u (AP%s): APIC ID: %u\n", 743 node->id, hyperthread, node->hwid); 744 break; 745 default: 746 /* ignored */ 747 break; 748 } 749 } 750 } 751 752 /* 753 * Add a scheduling group, a group of logical processors sharing 754 * a particular cache (and, thus having an affinity), to the scheduling 755 * topology. 756 * This function recursively works on lower level caches. 757 */ 758 static void 759 x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root) 760 { 761 struct topo_node *node; 762 int nchildren; 763 int ncores; 764 int i; 765 766 KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE || 767 root->type == TOPO_TYPE_GROUP, 768 ("x86topo_add_sched_group: bad type: %u", root->type)); 769 CPU_COPY(&root->cpuset, &cg_root->cg_mask); 770 cg_root->cg_count = root->cpu_count; 771 if (root->type == TOPO_TYPE_SYSTEM) 772 cg_root->cg_level = CG_SHARE_NONE; 773 else 774 cg_root->cg_level = root->subtype; 775 776 /* 777 * Check how many core nodes we have under the given root node. 778 * If we have multiple logical processors, but not multiple 779 * cores, then those processors must be hardware threads. 
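 * Such a group is flagged with CG_FLAG_SMT below.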
780 */ 781 ncores = 0; 782 node = root; 783 while (node != NULL) { 784 if (node->type != TOPO_TYPE_CORE) { 785 node = topo_next_node(root, node); 786 continue; 787 } 788 789 ncores++; 790 node = topo_next_nonchild_node(root, node); 791 } 792 793 if (cg_root->cg_level != CG_SHARE_NONE && 794 root->cpu_count > 1 && ncores < 2) 795 cg_root->cg_flags = CG_FLAG_SMT; 796 797 /* 798 * Find out how many cache nodes we have under the given root node. 799 * We ignore cache nodes that cover all the same processors as the 800 * root node. Also, we do not descend below found cache nodes. 801 * That is, we count top-level "non-redundant" caches under the root 802 * node. 803 */ 804 nchildren = 0; 805 node = root; 806 while (node != NULL) { 807 if ((node->type != TOPO_TYPE_GROUP && 808 node->type != TOPO_TYPE_CACHE) || 809 (root->type != TOPO_TYPE_SYSTEM && 810 CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { 811 node = topo_next_node(root, node); 812 continue; 813 } 814 nchildren++; 815 node = topo_next_nonchild_node(root, node); 816 } 817 818 cg_root->cg_child = smp_topo_alloc(nchildren); 819 cg_root->cg_children = nchildren; 820 821 /* 822 * Now find again the same cache nodes as above and recursively 823 * build scheduling topologies for them. 824 */ 825 node = root; 826 i = 0; 827 while (node != NULL) { 828 if ((node->type != TOPO_TYPE_GROUP && 829 node->type != TOPO_TYPE_CACHE) || 830 (root->type != TOPO_TYPE_SYSTEM && 831 CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { 832 node = topo_next_node(root, node); 833 continue; 834 } 835 cg_root->cg_child[i].cg_parent = cg_root; 836 x86topo_add_sched_group(node, &cg_root->cg_child[i]); 837 i++; 838 node = topo_next_nonchild_node(root, node); 839 } 840 } 841 842 /* 843 * Build the MI scheduling topology from the discovered hardware topology. 844 */ 845 struct cpu_group * 846 cpu_topo(void) 847 { 848 struct cpu_group *cg_root; 849 850 if (mp_ncpus <= 1) 851 return (smp_topo_none()); 852 853 cg_root = smp_topo_alloc(1); 854 x86topo_add_sched_group(&topo_root, cg_root); 855 return (cg_root); 856 } 857 858 static void 859 cpu_alloc(void *dummy __unused) 860 { 861 /* 862 * Dynamically allocate the arrays that depend on the 863 * maximum APIC ID. 864 */ 865 cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS, 866 M_WAITOK | M_ZERO); 867 apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS, 868 M_WAITOK | M_ZERO); 869 } 870 SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL); 871 872 /* 873 * Add a logical CPU to the topology. 874 */ 875 void 876 cpu_add(u_int apic_id, char boot_cpu) 877 { 878 879 if (apic_id > max_apic_id) { 880 panic("SMP: APIC ID %d too high", apic_id); 881 return; 882 } 883 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice", 884 apic_id)); 885 cpu_info[apic_id].cpu_present = 1; 886 if (boot_cpu) { 887 KASSERT(boot_cpu_id == -1, 888 ("CPU %u claims to be BSP, but CPU %u already is", apic_id, 889 boot_cpu_id)); 890 boot_cpu_id = apic_id; 891 cpu_info[apic_id].cpu_bsp = 1; 892 } 893 if (bootverbose) 894 printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" : 895 "AP"); 896 } 897 898 void 899 cpu_mp_setmaxid(void) 900 { 901 902 /* 903 * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). 904 * If there were no calls to cpu_add() assume this is a UP system. 905 */ 906 if (mp_ncpus == 0) 907 mp_ncpus = 1; 908 } 909 910 int 911 cpu_mp_probe(void) 912 { 913 914 /* 915 * Always record BSP in CPU map so that the mbuf init code works 916 * correctly. 
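 * The BSP is always CPU 0 (see assign_cpu_ids()).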
917 */ 918 CPU_SETOF(0, &all_cpus); 919 return (mp_ncpus > 1); 920 } 921 922 /* Allocate memory for the AP trampoline. */ 923 void 924 alloc_ap_trampoline(vm_paddr_t *physmap, unsigned int *physmap_idx) 925 { 926 unsigned int i; 927 bool allocated; 928 929 allocated = false; 930 for (i = *physmap_idx; i <= *physmap_idx; i -= 2) { 931 /* 932 * Find a memory region big enough and below the 1MB boundary 933 * for the trampoline code. 934 * NB: needs to be page aligned. 935 */ 936 if (physmap[i] >= MiB(1) || 937 (trunc_page(physmap[i + 1]) - round_page(physmap[i])) < 938 round_page(bootMP_size)) 939 continue; 940 941 allocated = true; 942 /* 943 * Try to steal from the end of the region to mimic previous 944 * behaviour, else fallback to steal from the start. 945 */ 946 if (physmap[i + 1] < MiB(1)) { 947 boot_address = trunc_page(physmap[i + 1]); 948 if ((physmap[i + 1] - boot_address) < bootMP_size) 949 boot_address -= round_page(bootMP_size); 950 physmap[i + 1] = boot_address; 951 } else { 952 boot_address = round_page(physmap[i]); 953 physmap[i] = boot_address + round_page(bootMP_size); 954 } 955 if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) { 956 memmove(&physmap[i], &physmap[i + 2], 957 sizeof(*physmap) * (*physmap_idx - i + 2)); 958 *physmap_idx -= 2; 959 } 960 break; 961 } 962 963 if (!allocated) { 964 boot_address = basemem * 1024 - bootMP_size; 965 if (bootverbose) 966 printf( 967 "Cannot find enough space for the boot trampoline, placing it at %#x", 968 boot_address); 969 } 970 } 971 972 /* 973 * AP CPU's call this to initialize themselves. 974 */ 975 void 976 init_secondary_tail(void) 977 { 978 u_int cpuid; 979 980 pmap_activate_boot(vmspace_pmap(proc0.p_vmspace)); 981 982 /* 983 * On real hardware, switch to x2apic mode if possible. Do it 984 * after aps_ready was signalled, to avoid manipulating the 985 * mode while BSP might still want to send some IPI to us 986 * (second startup IPI is ignored on modern hardware etc). 987 */ 988 lapic_xapic_mode(); 989 990 /* Initialize the PAT MSR. */ 991 pmap_init_pat(); 992 993 /* set up CPU registers and state */ 994 cpu_setregs(); 995 996 /* set up SSE/NX */ 997 initializecpu(); 998 999 /* set up FPU state on the AP */ 1000 #ifdef __amd64__ 1001 fpuinit(); 1002 #else 1003 npxinit(false); 1004 #endif 1005 1006 if (cpu_ops.cpu_init) 1007 cpu_ops.cpu_init(); 1008 1009 /* A quick check from sanity claus */ 1010 cpuid = PCPU_GET(cpuid); 1011 if (PCPU_GET(apic_id) != lapic_id()) { 1012 printf("SMP: cpuid = %d\n", cpuid); 1013 printf("SMP: actual apic_id = %d\n", lapic_id()); 1014 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); 1015 panic("cpuid mismatch! boom!!"); 1016 } 1017 1018 /* Initialize curthread. */ 1019 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); 1020 PCPU_SET(curthread, PCPU_GET(idlethread)); 1021 1022 mtx_lock_spin(&ap_boot_mtx); 1023 1024 mca_init(); 1025 1026 /* Init local apic for irq's */ 1027 lapic_setup(1); 1028 1029 /* Set memory range attributes for this CPU to match the BSP */ 1030 mem_range_AP_init(); 1031 1032 smp_cpus++; 1033 1034 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); 1035 if (bootverbose) 1036 printf("SMP: AP CPU #%d Launched!\n", cpuid); 1037 else 1038 printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "", 1039 cpuid, smp_cpus == mp_ncpus ? "\n" : " "); 1040 1041 /* Determine if we are a logical CPU. 
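 * A logical CPU here means a hyperthread; such CPUs are tracked in
 * logical_cpus_mask.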
*/ 1042 if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread) 1043 CPU_SET(cpuid, &logical_cpus_mask); 1044 1045 if (bootverbose) 1046 lapic_dump("AP"); 1047 1048 if (smp_cpus == mp_ncpus) { 1049 /* enable IPI's, tlb shootdown, freezes etc */ 1050 atomic_store_rel_int(&smp_started, 1); 1051 } 1052 1053 #ifdef __amd64__ 1054 /* 1055 * Enable global pages TLB extension 1056 * This also implicitly flushes the TLB 1057 */ 1058 load_cr4(rcr4() | CR4_PGE); 1059 if (pmap_pcid_enabled) 1060 load_cr4(rcr4() | CR4_PCIDE); 1061 load_ds(_udatasel); 1062 load_es(_udatasel); 1063 load_fs(_ufssel); 1064 #endif 1065 1066 mtx_unlock_spin(&ap_boot_mtx); 1067 1068 /* Wait until all the AP's are up. */ 1069 while (atomic_load_acq_int(&smp_started) == 0) 1070 ia32_pause(); 1071 1072 #ifndef EARLY_AP_STARTUP 1073 /* Start per-CPU event timers. */ 1074 cpu_initclocks_ap(); 1075 #endif 1076 1077 sched_throw(NULL); 1078 1079 panic("scheduler returned us to %s", __func__); 1080 /* NOTREACHED */ 1081 } 1082 1083 static void 1084 smp_after_idle_runnable(void *arg __unused) 1085 { 1086 struct thread *idle_td; 1087 int cpu; 1088 1089 for (cpu = 1; cpu < mp_ncpus; cpu++) { 1090 idle_td = pcpu_find(cpu)->pc_idlethread; 1091 while (atomic_load_int(&idle_td->td_lastcpu) == NOCPU && 1092 atomic_load_int(&idle_td->td_oncpu) == NOCPU) 1093 cpu_spinwait(); 1094 kmem_free((vm_offset_t)bootstacks[cpu], kstack_pages * 1095 PAGE_SIZE); 1096 } 1097 } 1098 SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY, 1099 smp_after_idle_runnable, NULL); 1100 1101 /* 1102 * We tell the I/O APIC code about all the CPUs we want to receive 1103 * interrupts. If we don't want certain CPUs to receive IRQs we 1104 * can simply not tell the I/O APIC code about them in this function. 1105 * We also do not tell it about the BSP since it tells itself about 1106 * the BSP internally to work with UP kernels and on UP machines. 1107 */ 1108 void 1109 set_interrupt_apic_ids(void) 1110 { 1111 u_int i, apic_id; 1112 1113 for (i = 0; i < MAXCPU; i++) { 1114 apic_id = cpu_apic_ids[i]; 1115 if (apic_id == -1) 1116 continue; 1117 if (cpu_info[apic_id].cpu_bsp) 1118 continue; 1119 if (cpu_info[apic_id].cpu_disabled) 1120 continue; 1121 1122 /* Don't let hyperthreads service interrupts. */ 1123 if (cpu_info[apic_id].cpu_hyperthread) 1124 continue; 1125 1126 intr_add_cpu(i); 1127 } 1128 } 1129 1130 1131 #ifdef COUNT_XINVLTLB_HITS 1132 u_int xhits_gbl[MAXCPU]; 1133 u_int xhits_pg[MAXCPU]; 1134 u_int xhits_rng[MAXCPU]; 1135 static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); 1136 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, 1137 sizeof(xhits_gbl), "IU", ""); 1138 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, 1139 sizeof(xhits_pg), "IU", ""); 1140 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, 1141 sizeof(xhits_rng), "IU", ""); 1142 1143 u_int ipi_global; 1144 u_int ipi_page; 1145 u_int ipi_range; 1146 u_int ipi_range_size; 1147 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); 1148 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); 1149 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); 1150 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 1151 0, ""); 1152 #endif /* COUNT_XINVLTLB_HITS */ 1153 1154 /* 1155 * Init and startup IPI. 
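 * The BSP uses the INIT, STARTUP, STARTUP sequence below to wake each AP.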
1156 */ 1157 void 1158 ipi_startup(int apic_id, int vector) 1159 { 1160 1161 /* 1162 * This attempts to follow the algorithm described in the 1163 * Intel Multiprocessor Specification v1.4, section B.4. 1164 * For each IPI, we allow the local APIC ~20us to deliver the 1165 * IPI. If that times out, we panic. 1166 */ 1167 1168 /* 1169 * First we send an INIT IPI: it might be acted on, resetting 1170 * and running the target CPU; it might be latched (P5 1171 * bug), leaving the CPU waiting for a STARTUP IPI; or it might be 1172 * ignored. 1173 */ 1174 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | 1175 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); 1176 lapic_ipi_wait(100); 1177 1178 /* Explicitly deassert the INIT IPI. */ 1179 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | 1180 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 1181 apic_id); 1182 1183 DELAY(10000); /* wait ~10ms */ 1184 1185 /* 1186 * Next we send a STARTUP IPI: the previous INIT IPI might still be 1187 * latched (P5 bug), in which case this first STARTUP would terminate 1188 * immediately and the previously started INIT IPI would continue; or 1189 * the previous INIT IPI has already run and this STARTUP IPI will 1190 * run; or the previous INIT IPI was ignored and this STARTUP IPI 1191 * will run. 1192 */ 1193 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 1194 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 1195 vector, apic_id); 1196 if (!lapic_ipi_wait(100)) 1197 panic("Failed to deliver first STARTUP IPI to APIC %d", 1198 apic_id); 1199 DELAY(200); /* wait ~200us */ 1200 1201 /* 1202 * Finally we send a second STARTUP IPI: it should run IF 1203 * the previous STARTUP IPI was cancelled by a latched INIT IPI; otherwise 1204 * it will be ignored, as only ONE STARTUP IPI is 1205 * recognized after hardware RESET or INIT IPI. 1206 */ 1207 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 1208 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 1209 vector, apic_id); 1210 if (!lapic_ipi_wait(100)) 1211 panic("Failed to deliver second STARTUP IPI to APIC %d", 1212 apic_id); 1213 1214 DELAY(200); /* wait ~200us */ 1215 } 1216 1217 /* 1218 * Send an IPI to specified CPU handling the bitmap logic.
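 * Bitmapped IPIs share the single IPI_BITMAP_VECTOR; requests accumulate
 * in cpu_ipi_pending[cpu] and the vector is only sent when that word was
 * previously empty.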
1219 */ 1220 void 1221 ipi_send_cpu(int cpu, u_int ipi) 1222 { 1223 u_int bitmap, old_pending, new_pending; 1224 1225 KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); 1226 1227 if (IPI_IS_BITMAPED(ipi)) { 1228 bitmap = 1 << ipi; 1229 ipi = IPI_BITMAP_VECTOR; 1230 do { 1231 old_pending = cpu_ipi_pending[cpu]; 1232 new_pending = old_pending | bitmap; 1233 } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], 1234 old_pending, new_pending)); 1235 if (old_pending) 1236 return; 1237 } 1238 lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); 1239 } 1240 1241 void 1242 ipi_bitmap_handler(struct trapframe frame) 1243 { 1244 struct trapframe *oldframe; 1245 struct thread *td; 1246 int cpu = PCPU_GET(cpuid); 1247 u_int ipi_bitmap; 1248 1249 critical_enter(); 1250 td = curthread; 1251 td->td_intr_nesting_level++; 1252 oldframe = td->td_intr_frame; 1253 td->td_intr_frame = &frame; 1254 ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); 1255 if (ipi_bitmap & (1 << IPI_PREEMPT)) { 1256 #ifdef COUNT_IPIS 1257 (*ipi_preempt_counts[cpu])++; 1258 #endif 1259 sched_preempt(td); 1260 } 1261 if (ipi_bitmap & (1 << IPI_AST)) { 1262 #ifdef COUNT_IPIS 1263 (*ipi_ast_counts[cpu])++; 1264 #endif 1265 /* Nothing to do for AST */ 1266 } 1267 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { 1268 #ifdef COUNT_IPIS 1269 (*ipi_hardclock_counts[cpu])++; 1270 #endif 1271 hardclockintr(); 1272 } 1273 td->td_intr_frame = oldframe; 1274 td->td_intr_nesting_level--; 1275 critical_exit(); 1276 } 1277 1278 /* 1279 * send an IPI to a set of cpus. 1280 */ 1281 void 1282 ipi_selected(cpuset_t cpus, u_int ipi) 1283 { 1284 int cpu; 1285 1286 /* 1287 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1288 * of help in order to understand what is the source. 1289 * Set the mask of receiving CPUs for this purpose. 1290 */ 1291 if (ipi == IPI_STOP_HARD) 1292 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus); 1293 1294 while ((cpu = CPU_FFS(&cpus)) != 0) { 1295 cpu--; 1296 CPU_CLR(cpu, &cpus); 1297 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1298 ipi_send_cpu(cpu, ipi); 1299 } 1300 } 1301 1302 /* 1303 * send an IPI to a specific CPU. 1304 */ 1305 void 1306 ipi_cpu(int cpu, u_int ipi) 1307 { 1308 1309 /* 1310 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1311 * of help in order to understand what is the source. 1312 * Set the mask of receiving CPUs for this purpose. 1313 */ 1314 if (ipi == IPI_STOP_HARD) 1315 CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending); 1316 1317 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1318 ipi_send_cpu(cpu, ipi); 1319 } 1320 1321 /* 1322 * send an IPI to all CPUs EXCEPT myself 1323 */ 1324 void 1325 ipi_all_but_self(u_int ipi) 1326 { 1327 cpuset_t other_cpus; 1328 1329 other_cpus = all_cpus; 1330 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 1331 if (IPI_IS_BITMAPED(ipi)) { 1332 ipi_selected(other_cpus, ipi); 1333 return; 1334 } 1335 1336 /* 1337 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1338 * of help in order to understand what is the source. 1339 * Set the mask of receiving CPUs for this purpose. 
1340 */ 1341 if (ipi == IPI_STOP_HARD) 1342 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus); 1343 1344 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); 1345 lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); 1346 } 1347 1348 int 1349 ipi_nmi_handler(void) 1350 { 1351 u_int cpuid; 1352 1353 /* 1354 * As long as there is not a simple way to know about a NMI's 1355 * source, if the bitmask for the current CPU is present in 1356 * the global pending bitword an IPI_STOP_HARD has been issued 1357 * and should be handled. 1358 */ 1359 cpuid = PCPU_GET(cpuid); 1360 if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending)) 1361 return (1); 1362 1363 CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending); 1364 cpustop_handler(); 1365 return (0); 1366 } 1367 1368 int nmi_kdb_lock; 1369 1370 void 1371 nmi_call_kdb_smp(u_int type, struct trapframe *frame) 1372 { 1373 int cpu; 1374 bool call_post; 1375 1376 cpu = PCPU_GET(cpuid); 1377 if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) { 1378 nmi_call_kdb(cpu, type, frame); 1379 call_post = false; 1380 } else { 1381 savectx(&stoppcbs[cpu]); 1382 CPU_SET_ATOMIC(cpu, &stopped_cpus); 1383 while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) 1384 ia32_pause(); 1385 call_post = true; 1386 } 1387 atomic_store_rel_int(&nmi_kdb_lock, 0); 1388 if (call_post) 1389 cpustop_handler_post(cpu); 1390 } 1391 1392 /* 1393 * Handle an IPI_STOP by saving our current context and spinning until we 1394 * are resumed. 1395 */ 1396 void 1397 cpustop_handler(void) 1398 { 1399 u_int cpu; 1400 1401 cpu = PCPU_GET(cpuid); 1402 1403 savectx(&stoppcbs[cpu]); 1404 1405 /* Indicate that we are stopped */ 1406 CPU_SET_ATOMIC(cpu, &stopped_cpus); 1407 1408 /* Wait for restart */ 1409 while (!CPU_ISSET(cpu, &started_cpus)) 1410 ia32_pause(); 1411 1412 cpustop_handler_post(cpu); 1413 } 1414 1415 static void 1416 cpustop_handler_post(u_int cpu) 1417 { 1418 1419 CPU_CLR_ATOMIC(cpu, &started_cpus); 1420 CPU_CLR_ATOMIC(cpu, &stopped_cpus); 1421 1422 /* 1423 * We don't broadcast TLB invalidations to other CPUs when they are 1424 * stopped. Hence, we clear the TLB before resuming. 1425 */ 1426 invltlb_glob(); 1427 1428 #if defined(__amd64__) && defined(DDB) 1429 amd64_db_resume_dbreg(); 1430 #endif 1431 1432 if (cpu == 0 && cpustop_restartfunc != NULL) { 1433 cpustop_restartfunc(); 1434 cpustop_restartfunc = NULL; 1435 } 1436 } 1437 1438 /* 1439 * Handle an IPI_SUSPEND by saving our current context and spinning until we 1440 * are resumed. 1441 */ 1442 void 1443 cpususpend_handler(void) 1444 { 1445 u_int cpu; 1446 1447 mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); 1448 1449 cpu = PCPU_GET(cpuid); 1450 if (savectx(&susppcbs[cpu]->sp_pcb)) { 1451 #ifdef __amd64__ 1452 fpususpend(susppcbs[cpu]->sp_fpususpend); 1453 #else 1454 npxsuspend(susppcbs[cpu]->sp_fpususpend); 1455 #endif 1456 /* 1457 * suspended_cpus is cleared shortly after each AP is restarted 1458 * by a Startup IPI, so that the BSP can proceed to restarting 1459 * the next AP. 1460 * 1461 * resuming_cpus gets cleared when the AP completes 1462 * initialization after having been released by the BSP. 1463 * resuming_cpus is probably not the best name for the 1464 * variable, because it is actually a set of processors that 1465 * haven't resumed yet and haven't necessarily started resuming. 1466 * 1467 * Note that suspended_cpus is meaningful only for ACPI suspend 1468 * as it's not really used for Xen suspend since the APs are 1469 * automatically restored to the running state and the correct 1470 * context. For the same reason resumectx is never called in 1471 * that case. 
1472 */ 1473 CPU_SET_ATOMIC(cpu, &suspended_cpus); 1474 CPU_SET_ATOMIC(cpu, &resuming_cpus); 1475 1476 /* 1477 * Invalidate the cache after setting the global status bits. 1478 * The last AP to set its bit may end up being an Owner of the 1479 * corresponding cache line in MOESI protocol. The AP may be 1480 * stopped before the cache line is written to the main memory. 1481 */ 1482 wbinvd(); 1483 } else { 1484 #ifdef __amd64__ 1485 fpuresume(susppcbs[cpu]->sp_fpususpend); 1486 #else 1487 npxresume(susppcbs[cpu]->sp_fpususpend); 1488 #endif 1489 pmap_init_pat(); 1490 initializecpu(); 1491 PCPU_SET(switchtime, 0); 1492 PCPU_SET(switchticks, ticks); 1493 1494 /* Indicate that we have restarted and restored the context. */ 1495 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 1496 } 1497 1498 /* Wait for resume directive */ 1499 while (!CPU_ISSET(cpu, &toresume_cpus)) 1500 ia32_pause(); 1501 1502 /* Re-apply microcode updates. */ 1503 ucode_reload(); 1504 1505 #ifdef __i386__ 1506 /* Finish removing the identity mapping of low memory for this AP. */ 1507 invltlb_glob(); 1508 #endif 1509 1510 if (cpu_ops.cpu_resume) 1511 cpu_ops.cpu_resume(); 1512 #ifdef __amd64__ 1513 if (vmm_resume_p) 1514 vmm_resume_p(); 1515 #endif 1516 1517 /* Resume MCA and local APIC */ 1518 lapic_xapic_mode(); 1519 mca_resume(); 1520 lapic_setup(0); 1521 1522 /* Indicate that we are resumed */ 1523 CPU_CLR_ATOMIC(cpu, &resuming_cpus); 1524 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 1525 CPU_CLR_ATOMIC(cpu, &toresume_cpus); 1526 } 1527 1528 1529 void 1530 invlcache_handler(void) 1531 { 1532 uint32_t generation; 1533 1534 #ifdef COUNT_IPIS 1535 (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; 1536 #endif /* COUNT_IPIS */ 1537 1538 /* 1539 * Reading the generation here allows greater parallelism 1540 * since wbinvd is a serializing instruction. Without the 1541 * temporary, we'd wait for wbinvd to complete, then the read 1542 * would execute, then the dependent write, which must then 1543 * complete before return from interrupt. 1544 */ 1545 generation = smp_tlb_generation; 1546 wbinvd(); 1547 PCPU_SET(smp_tlb_done, generation); 1548 } 1549 1550 /* 1551 * This is called once the rest of the system is up and running and we're 1552 * ready to let the AP's out of the pen. 1553 */ 1554 static void 1555 release_aps(void *dummy __unused) 1556 { 1557 1558 if (mp_ncpus == 1) 1559 return; 1560 atomic_store_rel_int(&aps_ready, 1); 1561 while (smp_started == 0) 1562 ia32_pause(); 1563 } 1564 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); 1565 1566 #ifdef COUNT_IPIS 1567 /* 1568 * Setup interrupt counters for IPI handlers. 
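 * Only compiled in when the kernel is built with COUNT_IPIS.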
1569 */ 1570 static void 1571 mp_ipi_intrcnt(void *dummy) 1572 { 1573 char buf[64]; 1574 int i; 1575 1576 CPU_FOREACH(i) { 1577 snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); 1578 intrcnt_add(buf, &ipi_invltlb_counts[i]); 1579 snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); 1580 intrcnt_add(buf, &ipi_invlrng_counts[i]); 1581 snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); 1582 intrcnt_add(buf, &ipi_invlpg_counts[i]); 1583 snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); 1584 intrcnt_add(buf, &ipi_invlcache_counts[i]); 1585 snprintf(buf, sizeof(buf), "cpu%d:preempt", i); 1586 intrcnt_add(buf, &ipi_preempt_counts[i]); 1587 snprintf(buf, sizeof(buf), "cpu%d:ast", i); 1588 intrcnt_add(buf, &ipi_ast_counts[i]); 1589 snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); 1590 intrcnt_add(buf, &ipi_rendezvous_counts[i]); 1591 snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); 1592 intrcnt_add(buf, &ipi_hardclock_counts[i]); 1593 } 1594 } 1595 SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); 1596 #endif 1597 1598 /* 1599 * Flush the TLB on other CPU's 1600 */ 1601 1602 /* Variables needed for SMP tlb shootdown. */ 1603 vm_offset_t smp_tlb_addr1, smp_tlb_addr2; 1604 pmap_t smp_tlb_pmap; 1605 volatile uint32_t smp_tlb_generation; 1606 1607 #ifdef __amd64__ 1608 #define read_eflags() read_rflags() 1609 #endif 1610 1611 static void 1612 smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, 1613 vm_offset_t addr1, vm_offset_t addr2) 1614 { 1615 cpuset_t other_cpus; 1616 volatile uint32_t *p_cpudone; 1617 uint32_t generation; 1618 int cpu; 1619 1620 /* It is not necessary to signal other CPUs while in the debugger. */ 1621 if (kdb_active || panicstr != NULL) 1622 return; 1623 1624 /* 1625 * Check for other cpus. Return if none. 
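 * A fully-set mask means every CPU; it is later delivered as a single
 * all-but-self broadcast rather than per-CPU IPIs.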
1626 */ 1627 if (CPU_ISFULLSET(&mask)) { 1628 if (mp_ncpus <= 1) 1629 return; 1630 } else { 1631 CPU_CLR(PCPU_GET(cpuid), &mask); 1632 if (CPU_EMPTY(&mask)) 1633 return; 1634 } 1635 1636 if (!(read_eflags() & PSL_I)) 1637 panic("%s: interrupts disabled", __func__); 1638 mtx_lock_spin(&smp_ipi_mtx); 1639 smp_tlb_addr1 = addr1; 1640 smp_tlb_addr2 = addr2; 1641 smp_tlb_pmap = pmap; 1642 generation = ++smp_tlb_generation; 1643 if (CPU_ISFULLSET(&mask)) { 1644 ipi_all_but_self(vector); 1645 other_cpus = all_cpus; 1646 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 1647 } else { 1648 other_cpus = mask; 1649 while ((cpu = CPU_FFS(&mask)) != 0) { 1650 cpu--; 1651 CPU_CLR(cpu, &mask); 1652 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, 1653 cpu, vector); 1654 ipi_send_cpu(cpu, vector); 1655 } 1656 } 1657 while ((cpu = CPU_FFS(&other_cpus)) != 0) { 1658 cpu--; 1659 CPU_CLR(cpu, &other_cpus); 1660 p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done; 1661 while (*p_cpudone != generation) 1662 ia32_pause(); 1663 } 1664 mtx_unlock_spin(&smp_ipi_mtx); 1665 } 1666 1667 void 1668 smp_masked_invltlb(cpuset_t mask, pmap_t pmap) 1669 { 1670 1671 if (smp_started) { 1672 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0); 1673 #ifdef COUNT_XINVLTLB_HITS 1674 ipi_global++; 1675 #endif 1676 } 1677 } 1678 1679 void 1680 smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap) 1681 { 1682 1683 if (smp_started) { 1684 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0); 1685 #ifdef COUNT_XINVLTLB_HITS 1686 ipi_page++; 1687 #endif 1688 } 1689 } 1690 1691 void 1692 smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2, 1693 pmap_t pmap) 1694 { 1695 1696 if (smp_started) { 1697 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, 1698 addr1, addr2); 1699 #ifdef COUNT_XINVLTLB_HITS 1700 ipi_range++; 1701 ipi_range_size += (addr2 - addr1) / PAGE_SIZE; 1702 #endif 1703 } 1704 } 1705 1706 void 1707 smp_cache_flush(void) 1708 { 1709 1710 if (smp_started) { 1711 smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL, 1712 0, 0); 1713 } 1714 } 1715 1716 /* 1717 * Handlers for TLB related IPIs 1718 */ 1719 void 1720 invltlb_handler(void) 1721 { 1722 uint32_t generation; 1723 1724 #ifdef COUNT_XINVLTLB_HITS 1725 xhits_gbl[PCPU_GET(cpuid)]++; 1726 #endif /* COUNT_XINVLTLB_HITS */ 1727 #ifdef COUNT_IPIS 1728 (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; 1729 #endif /* COUNT_IPIS */ 1730 1731 /* 1732 * Reading the generation here allows greater parallelism 1733 * since invalidating the TLB is a serializing operation. 
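 * (the same pattern is used in invlcache_handler() around wbinvd)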
1734 */ 1735 generation = smp_tlb_generation; 1736 if (smp_tlb_pmap == kernel_pmap) 1737 invltlb_glob(); 1738 #ifdef __amd64__ 1739 else 1740 invltlb(); 1741 #endif 1742 PCPU_SET(smp_tlb_done, generation); 1743 } 1744 1745 void 1746 invlpg_handler(void) 1747 { 1748 uint32_t generation; 1749 1750 #ifdef COUNT_XINVLTLB_HITS 1751 xhits_pg[PCPU_GET(cpuid)]++; 1752 #endif /* COUNT_XINVLTLB_HITS */ 1753 #ifdef COUNT_IPIS 1754 (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; 1755 #endif /* COUNT_IPIS */ 1756 1757 generation = smp_tlb_generation; /* Overlap with serialization */ 1758 #ifdef __i386__ 1759 if (smp_tlb_pmap == kernel_pmap) 1760 #endif 1761 invlpg(smp_tlb_addr1); 1762 PCPU_SET(smp_tlb_done, generation); 1763 } 1764 1765 void 1766 invlrng_handler(void) 1767 { 1768 vm_offset_t addr, addr2; 1769 uint32_t generation; 1770 1771 #ifdef COUNT_XINVLTLB_HITS 1772 xhits_rng[PCPU_GET(cpuid)]++; 1773 #endif /* COUNT_XINVLTLB_HITS */ 1774 #ifdef COUNT_IPIS 1775 (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; 1776 #endif /* COUNT_IPIS */ 1777 1778 addr = smp_tlb_addr1; 1779 addr2 = smp_tlb_addr2; 1780 generation = smp_tlb_generation; /* Overlap with serialization */ 1781 #ifdef __i386__ 1782 if (smp_tlb_pmap == kernel_pmap) 1783 #endif 1784 do { 1785 invlpg(addr); 1786 addr += PAGE_SIZE; 1787 } while (addr < addr2); 1788 1789 PCPU_SET(smp_tlb_done, generation); 1790 } 1791