/*
 * Copyright (c) 1996, by Steve Passe
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/i386/i386/mp_machdep.c,v 1.115.2.15 2003/03/14 21:22:35 jhb Exp $
 */

#include "opt_cpu.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/cons.h>	/* cngetc() */
#include <sys/machintr.h>
#include <sys/cpu_topology.h>

#include <sys/mplock2.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <sys/lock.h>
#include <vm/vm_map.h>
#include <sys/user.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif

#include <machine/smp.h>
#include <machine_base/apic/apicreg.h>
#include <machine/atomic.h>
#include <machine/cpufunc.h>
#include <machine/cputypes.h>
#include <machine_base/apic/lapic.h>
#include <machine_base/apic/ioapic.h>
#include <machine_base/acpica/acpi_md_cpu.h>
#include <machine/psl.h>
#include <machine/segments.h>
#include <machine/tss.h>
#include <machine/specialreg.h>
#include <machine/globaldata.h>
#include <machine/pmap_inval.h>
#include <machine/clock.h>

#include <machine/md_var.h>		/* setidt() */
#include <machine_base/icu/icu.h>	/* IPIs */
#include <machine_base/icu/icu_var.h>
#include <machine_base/apic/ioapic_abi.h>
#include <machine/intr_machdep.h>	/* IPIs */

#define WARMBOOT_TARGET		0
#define WARMBOOT_OFF		(KERNBASE + 0x0467)
#define WARMBOOT_SEG		(KERNBASE + 0x0469)

#define CMOS_REG		(0x70)
#define CMOS_DATA		(0x71)
#define BIOS_RESET		(0x0f)
#define BIOS_WARM		(0x0a)
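
/*
 * The warm-boot convention used below: writing BIOS_WARM (0x0a) into the
 * CMOS shutdown status register (BIOS_RESET, offset 0x0f) tells the BIOS
 * to skip its normal POST on the next processor reset and instead jump
 * through the real-mode vector stored at 0x40:0x67, i.e. the physical
 * WARMBOOT_OFF/WARMBOOT_SEG words above, which we point at the AP
 * trampoline.
 */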

/*
 * This code MUST be enabled here and in mpboot.s.  It follows the very
 * early stages of AP boot by placing values in CMOS ram.  It NORMALLY
 * will never be needed, hence the primitive method for enabling it.
 */
#if defined(CHECK_POINTS)
#define CHECK_READ(A)	 (outb(CMOS_REG, (A)), inb(CMOS_DATA))
#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))

#define CHECK_INIT(D);				\
	CHECK_WRITE(0x34, (D));			\
	CHECK_WRITE(0x35, (D));			\
	CHECK_WRITE(0x36, (D));			\
	CHECK_WRITE(0x37, (D));			\
	CHECK_WRITE(0x38, (D));			\
	CHECK_WRITE(0x39, (D));

#define CHECK_PRINT(S);				\
	kprintf("%s: %d, %d, %d, %d, %d, %d\n",	\
	   (S),					\
	   CHECK_READ(0x34),			\
	   CHECK_READ(0x35),			\
	   CHECK_READ(0x36),			\
	   CHECK_READ(0x37),			\
	   CHECK_READ(0x38),			\
	   CHECK_READ(0x39));

#else				/* CHECK_POINTS */

#define CHECK_INIT(D)
#define CHECK_PRINT(S)

#endif				/* CHECK_POINTS */

/*
 * Values to send to the POST hardware.
 */
#define MP_BOOTADDRESS_POST	0x10
#define MP_PROBE_POST		0x11
#define MPTABLE_PASS1_POST	0x12

#define MP_START_POST		0x13
#define MP_ENABLE_POST		0x14
#define MPTABLE_PASS2_POST	0x15

#define START_ALL_APS_POST	0x16
#define INSTALL_AP_TRAMP_POST	0x17
#define START_AP_POST		0x18

#define MP_ANNOUNCE_POST	0x19

/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */
int	current_postcode;

/** XXX FIXME: what system files declare these??? */

extern int naps;

int64_t tsc0_offset;
extern int64_t tsc_offsets[];

/* AP uses this during bootstrap.  Do not staticize.  */
char *bootSTK;
static int bootAP;

struct pcb stoppcbs[MAXCPU];

extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

/*
 * Local data and functions.
 */

static u_int	boot_address;
static int	mp_finish;
static int	mp_finish_lapic;

static int	start_all_aps(u_int boot_addr);
#if 0
static void	install_ap_tramp(u_int boot_addr);
#endif
static int	start_ap(struct mdglobaldata *gd, u_int boot_addr, int smibest);
static int	smitest(void);
static void	mp_bsp_simple_setup(void);

/* which cpus have been started */
static cpumask_t smp_startup_mask = CPUMASK_INITIALIZER_ONLYONE;
/* which cpus have their lapic initialized */
static cpumask_t smp_lapic_mask = CPUMASK_INITIALIZER_ONLYONE;
/* which cpus are ready for IPIs etc? */
cpumask_t smp_active_mask = CPUMASK_INITIALIZER_ONLYONE;
cpumask_t smp_finalize_mask = CPUMASK_INITIALIZER_ONLYONE;

SYSCTL_OPAQUE(_machdep, OID_AUTO, smp_active, CTLFLAG_RD,
	      &smp_active_mask, sizeof(smp_active_mask), "LU", "");
static u_int	bootMP_size;
static u_int	report_invlpg_src;
SYSCTL_INT(_machdep, OID_AUTO, report_invlpg_src, CTLFLAG_RW,
	   &report_invlpg_src, 0, "");
static u_int	report_invltlb_src;
SYSCTL_INT(_machdep, OID_AUTO, report_invltlb_src, CTLFLAG_RW,
	   &report_invltlb_src, 0, "");
static int	optimized_invltlb;
SYSCTL_INT(_machdep, OID_AUTO, optimized_invltlb, CTLFLAG_RW,
	   &optimized_invltlb, 0, "");
static int	all_but_self_ipi_enable = 1;
SYSCTL_INT(_machdep, OID_AUTO, all_but_self_ipi_enable, CTLFLAG_RW,
	   &all_but_self_ipi_enable, 0, "");

/* Local data for detecting CPU TOPOLOGY */
static int core_bits = 0;
static int logical_CPU_bits = 0;


/*
 * Calculate usable address in base memory for AP trampoline code.
 */
u_int
mp_bootaddress(u_int basemem)
{
	POSTCODE(MP_BOOTADDRESS_POST);

	bootMP_size = mptramp_end - mptramp_start;
	boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */
	if (((basemem * 1024) - boot_address) < bootMP_size)
		boot_address -= PAGE_SIZE;	/* not enough, lower by 4k */
	/* 3 levels of page table pages */
	mptramp_pagetables = boot_address - (PAGE_SIZE * 3);

	return mptramp_pagetables;
}

/*
 * Print various information about the SMP system hardware and setup.
 */
void
mp_announce(void)
{
	int x;

	POSTCODE(MP_ANNOUNCE_POST);

	kprintf("DragonFly/MP: Multiprocessor motherboard\n");
	kprintf(" cpu0 (BSP): apic id: %2d\n", CPUID_TO_APICID(0));
	for (x = 1; x <= naps; ++x)
		kprintf(" cpu%d (AP): apic id: %2d\n", x, CPUID_TO_APICID(x));

	if (!ioapic_enable)
		kprintf(" Warning: APIC I/O disabled\n");
}

/*
 * AP cpus call this to sync up protected mode.
 *
 * WARNING! %gs is not set up on entry.  This routine sets up %gs.
 */
void
init_secondary(void)
{
	int	gsel_tss;
	int	x, myid = bootAP;
	u_int64_t msr, cr0;
	struct mdglobaldata *md;
	struct privatespace *ps;

	ps = CPU_prvspace[myid];

	gdt_segs[GPROC0_SEL].ssd_base = (long)&ps->common_tss;
	ps->mdglobaldata.mi.gd_prvspace = ps;

	/* We fill the 32-bit segment descriptors */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x]);
	}
	/* And now a 64-bit one */
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[myid * NGDT + GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) &gdt[myid * NGDT];
	lgdt(&r_gdt);			/* does magic intra-segment return */

	/* lgdt() destroys the GSBASE value, so we load GSBASE after lgdt() */
	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)ps);
	wrmsr(MSR_KGSBASE, 0);		/* XXX User value while we're in the kernel */

	lidt(&r_idt_arr[mdcpu->mi.gd_cpuid]);

#if 0
	lldt(_default_ldt);
	mdcpu->gd_currentldt = _default_ldt;
#endif

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gdt[myid * NGDT + GPROC0_SEL].sd_type = SDT_SYSTSS;

	md = mdcpu;	/* loaded through %gs:0 (mdglobaldata.mi.gd_prvspace)*/

	/*
	 * TSS entry point for interrupts, traps, and exceptions
	 * (sans NMI).  This will always go to near the top of the pcpu
	 * trampoline area.  Hardware-pushed data will be copied into
	 * the trap-frame on entry, and (if necessary) returned to the
	 * trampoline on exit.
	 *
	 * We store some pcb data for the trampoline code above the
	 * stack the cpu hw pushes into, and arrange things so the
	 * address of tr_pcb_rsp is the same as the desired top of
	 * stack.
	 */
	ps->common_tss.tss_rsp0 = (register_t)&ps->trampoline.tr_pcb_rsp;
	ps->trampoline.tr_pcb_rsp = ps->common_tss.tss_rsp0;

#if 0 /* JG XXX */
	ps->common_tss.tss_ioopt = (sizeof ps->common_tss) << 16;
#endif
	md->gd_tss_gdt = &gdt[myid * NGDT + GPROC0_SEL];
	md->gd_common_tssd = *md->gd_tss_gdt;

	/* double fault stack */
	ps->common_tss.tss_ist1 = (register_t)ps->dblstack +
				  sizeof(ps->dblstack);

	ltr(gsel_tss);

	/*
	 * Set to a known state:
	 *	Set by mpboot.s: CR0_PG, CR0_PE
	 *	Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	cr0 = rcr0();
	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
	load_cr0(cr0);

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL);

	pmap_set_opt();		/* PSE/4MB pages, etc */
	pmap_init_pat();	/* Page Attribute Table */

	/* set up CPU registers and state */
	cpu_setregs();

	/* set up SSE/NX registers */
	initializecpu(myid);

	/* set up FPU state on the AP */
	npxinit();

	/* disable the APIC, just to be SURE */
	lapic->svr &= ~APIC_SVR_ENABLE;
}

/*******************************************************************
 * local functions and data
 */

/*
 * Start the SMP system
 */
static void
mp_start_aps(void *dummy __unused)
{
	if (lapic_enable) {
		/* start each Application Processor */
		start_all_aps(boot_address);
	} else {
		mp_bsp_simple_setup();
	}
}
SYSINIT(startaps, SI_BOOT2_START_APS, SI_ORDER_FIRST, mp_start_aps, NULL);

/*
 * start each AP in our list
 */
static int
start_all_aps(u_int boot_addr)
{
	vm_offset_t va = boot_address + KERNBASE;
	u_int64_t *pt4, *pt3, *pt2;
	int	pssize;
	int	x, i;
	int	shift;
	int	smicount;
	int	smibest;
	int	smilast;
	u_char	mpbiosreason;
	u_long	mpbioswarmvec;
	struct mdglobaldata *gd;
	struct privatespace *ps;
	size_t ipiq_size;

	POSTCODE(START_ALL_APS_POST);

	/* install the AP 1st level boot code */
	pmap_kenter(va, boot_address);
	cpu_invlpg((void *)va);		/* JG XXX */
	bcopy(mptramp_start, (void *)va, bootMP_size);

	/* Locate the page tables, they'll be below the trampoline */
	pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE);
	pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
	pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);

	/* Create the initial 1GB replicated page tables */
	for (i = 0; i < 512; i++) {
		/* Each slot of the level 4 pages points to the same level 3 page */
		pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
		pt4[i] |= kernel_pmap.pmap_bits[PG_V_IDX] |
			  kernel_pmap.pmap_bits[PG_RW_IDX] |
			  kernel_pmap.pmap_bits[PG_U_IDX];

		/* Each slot of the level 3 pages points to the same level 2 page */
		pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
		pt3[i] |= kernel_pmap.pmap_bits[PG_V_IDX] |
			  kernel_pmap.pmap_bits[PG_RW_IDX] |
			  kernel_pmap.pmap_bits[PG_U_IDX];

		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
		pt2[i] = i * (2 * 1024 * 1024);
		pt2[i] |= kernel_pmap.pmap_bits[PG_V_IDX] |
			  kernel_pmap.pmap_bits[PG_RW_IDX] |
			  kernel_pmap.pmap_bits[PG_PS_IDX] |
			  kernel_pmap.pmap_bits[PG_U_IDX];
	}
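
	/*
	 * At this point low physical memory is identity-mapped for the
	 * trampoline: every PML4 and PDP slot points at the same lower
	 * level table and the PD maps the first 1GB with 2MB pages, so
	 * the AP can enable paging while still executing at its low
	 * physical boot address.
	 */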

	/* save the current value of the warm-start vector */
	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
	outb(CMOS_REG, BIOS_RESET);
	mpbiosreason = inb(CMOS_DATA);

	/* setup a vector to our boot code */
	*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
	*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */

	/*
	 * If we have a TSC we can figure out the SMI interrupt rate.
	 * The SMI does not necessarily use a constant rate.  Spend
	 * up to 250ms trying to figure it out.
	 */
	smibest = 0;
	if (cpu_feature & CPUID_TSC) {
		set_apic_timer(275000);
		smilast = read_apic_timer();
		for (x = 0; x < 20 && read_apic_timer(); ++x) {
			smicount = smitest();
			if (smibest == 0 || smilast - smicount < smibest)
				smibest = smilast - smicount;
			smilast = smicount;
		}
		if (smibest > 250000)
			smibest = 0;
	}
	if (smibest)
		kprintf("SMI Frequency (worst case): %d Hz (%d us)\n",
			1000000 / smibest, smibest);

	/* start each AP */
	for (x = 1; x <= naps; ++x) {
		/* This is a bit verbose, it will go away soon.  */

		pssize = sizeof(struct privatespace);
		ps = (void *)kmem_alloc3(&kernel_map, pssize, VM_SUBSYS_GD,
					 KM_CPU(x));
		CPU_prvspace[x] = ps;
#if 0
		kprintf("ps %d %p %d\n", x, ps, pssize);
#endif
		bzero(ps, pssize);
		gd = &ps->mdglobaldata;
		gd->mi.gd_prvspace = ps;

		/* prime data page for it to use */
		mi_gdinit(&gd->mi, x);
		cpu_gdinit(gd, x);
		ipiq_size = sizeof(struct lwkt_ipiq) * (naps + 1);
		gd->mi.gd_ipiq = (void *)kmem_alloc3(&kernel_map, ipiq_size,
						     VM_SUBSYS_IPIQ, KM_CPU(x));
		bzero(gd->mi.gd_ipiq, ipiq_size);

		gd->gd_acpi_id = CPUID_TO_ACPIID(gd->mi.gd_cpuid);

		/* initialize arc4random. */
		arc4_init_pcpu(x);

		/* setup a vector to our boot code */
		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
		*((volatile u_short *) WARMBOOT_SEG) = (boot_addr >> 4);
		outb(CMOS_REG, BIOS_RESET);
		outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */

		/*
		 * Setup the AP boot stack
		 */
		bootSTK = &ps->idlestack[UPAGES * PAGE_SIZE - PAGE_SIZE];
		bootAP = x;

		/* attempt to start the Application Processor */
		CHECK_INIT(99);	/* setup checkpoints */
		if (!start_ap(gd, boot_addr, smibest)) {
			kprintf("\nAP #%d (PHY# %d) failed!\n",
				x, CPUID_TO_APICID(x));
			CHECK_PRINT("trace");	/* show checkpoints */
			/* better panic as the AP may be running loose */
			kprintf("panic y/n? [y] ");
			cnpoll(TRUE);
			if (cngetc() != 'n')
				panic("bye-bye");
			cnpoll(FALSE);
		}
		CHECK_PRINT("trace");	/* show checkpoints */
	}

	/* set ncpus to 1 + highest logical cpu.  Not all may have come up. */
	ncpus = x;

	for (shift = 0; (1 << shift) <= ncpus; ++shift)
		;
	--shift;

	/* ncpus_fit -- ncpus rounded up to the nearest power of 2 */
	if ((1 << shift) < ncpus)
		++shift;
	ncpus_fit = 1 << shift;
	ncpus_fit_mask = ncpus_fit - 1;
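
	/*
	 * For example, ncpus == 6 yields ncpus_fit == 8 and
	 * ncpus_fit_mask == 7, while an exact power of 2 such as
	 * ncpus == 4 is left as ncpus_fit == 4.
	 */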

	/* build our map of 'other' CPUs */
	mycpu->gd_other_cpus = smp_startup_mask;
	CPUMASK_NANDBIT(mycpu->gd_other_cpus, mycpu->gd_cpuid);

	gd = (struct mdglobaldata *)mycpu;
	gd->gd_acpi_id = CPUID_TO_ACPIID(mycpu->gd_cpuid);

	ipiq_size = sizeof(struct lwkt_ipiq) * ncpus;
	mycpu->gd_ipiq = (void *)kmem_alloc3(&kernel_map, ipiq_size,
					     VM_SUBSYS_IPIQ, KM_CPU(0));
	bzero(mycpu->gd_ipiq, ipiq_size);

	/* initialize arc4random. */
	arc4_init_pcpu(0);

	/* restore the warmstart vector */
	*(u_long *) WARMBOOT_OFF = mpbioswarmvec;
	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, mpbiosreason);

	/*
	 * NOTE!  The idlestack for the BSP was set up by locore.  Finish
	 * up, clean out the P==V mapping we did earlier.
	 */
	pmap_set_opt();

	/*
	 * Wait for all APs to finish initializing their LAPIC.
	 */
	if (bootverbose)
		kprintf("SMP: Waiting for APs to finish LAPIC initialization\n");
	if (cpu_feature & CPUID_TSC)
		tsc0_offset = rdtsc();
	tsc_offsets[0] = 0;
	mp_finish_lapic = 1;
	rel_mplock();

	while (CPUMASK_CMPMASKNEQ(smp_lapic_mask, smp_startup_mask)) {
		cpu_pause();
		cpu_lfence();
		if (cpu_feature & CPUID_TSC)
			tsc0_offset = rdtsc();
	}
	while (try_mplock() == 0) {
		cpu_pause();
		cpu_lfence();
	}

	/* number of APs actually started */
	return ncpus - 1;
}


/*
 * load the 1st level AP boot code into base memory.
 */

/* targets for relocation */
extern void bigJump(void);
extern void bootCodeSeg(void);
extern void bootDataSeg(void);
extern void MPentry(void);
extern u_int MP_GDT;
extern u_int mp_gdtbase;
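
/*
 * NOTE: install_ap_tramp() below is compiled out (#if 0).  The 64-bit
 * startup path in start_all_aps() simply bcopy()s the mptramp_start
 * trampoline into base memory instead of patching the relocation
 * targets above.
 */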

#if 0

static void
install_ap_tramp(u_int boot_addr)
{
	int     x;
	int     size = *(int *) ((u_long) & bootMP_size);
	u_char *src = (u_char *) ((u_long) bootMP);
	u_char *dst = (u_char *) boot_addr + KERNBASE;
	u_int   boot_base = (u_int) bootMP;
	u_int8_t *dst8;
	u_int16_t *dst16;
	u_int32_t *dst32;

	POSTCODE(INSTALL_AP_TRAMP_POST);

	for (x = 0; x < size; ++x)
		*dst++ = *src++;

	/*
	 * Modify addresses in code we just moved to basemem.  Unfortunately we
	 * need fairly detailed info about mpboot.s for this to work.  Changes
	 * to mpboot.s might require changes here.
	 */

	/* boot code is located in KERNEL space */
	dst = (u_char *) boot_addr + KERNBASE;

	/* modify the lgdt arg */
	dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
	*dst32 = boot_addr + ((u_int) & MP_GDT - boot_base);

	/* modify the ljmp target for MPentry() */
	dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
	*dst32 = ((u_int) MPentry - KERNBASE);

	/* modify the target for boot code segment */
	dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
	dst8 = (u_int8_t *) (dst16 + 1);
	*dst16 = (u_int) boot_addr & 0xffff;
	*dst8 = ((u_int) boot_addr >> 16) & 0xff;

	/* modify the target for boot data segment */
	dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
	dst8 = (u_int8_t *) (dst16 + 1);
	*dst16 = (u_int) boot_addr & 0xffff;
	*dst8 = ((u_int) boot_addr >> 16) & 0xff;
}

#endif

/*
 * This function starts the AP (application processor) identified
 * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
 * to accomplish this.  This is necessary because of the nuances
 * of the different hardware we might encounter.  It ain't pretty,
 * but it seems to work.
 *
 * NOTE: eventually an AP gets to ap_init(), which is called just
 * before the AP goes into the LWKT scheduler's idle loop.
 */
static int
start_ap(struct mdglobaldata *gd, u_int boot_addr, int smibest)
{
	int	physical_cpu;
	int	vector;
	u_long	icr_lo, icr_hi;

	POSTCODE(START_AP_POST);

	/* get the PHYSICAL APIC ID# */
	physical_cpu = CPUID_TO_APICID(gd->mi.gd_cpuid);

	/* calculate the vector */
	vector = (boot_addr >> 12) & 0xff;

	/* We don't want anything interfering */
	cpu_disable_intr();

	/* Make sure the target cpu sees everything */
	wbinvd();

	/*
	 * Try to detect when a SMI has occurred, wait up to 200ms.
	 *
	 * If a SMI occurs during an AP reset but before we issue
	 * the STARTUP command, the AP may brick.  To work around
	 * this problem we hold off doing the AP startup until
	 * after we have detected the SMI.  Hopefully another SMI
	 * will not occur before we finish the AP startup.
	 *
	 * Retries don't seem to help.  SMIs have a window of opportunity
	 * and if USB->legacy keyboard emulation is enabled in the BIOS
	 * the interrupt rate can be quite high.
	 *
	 * NOTE: Don't worry about the L1 cache load, it might bloat
	 *	 ldelta a little but ndelta will be so huge when the SMI
	 *	 occurs the detection logic will still work fine.
	 */
	if (smibest) {
		set_apic_timer(200000);
		smitest();
	}

	/*
	 * First we do an INIT/RESET IPI.  This INIT IPI might be run,
	 * resetting and running the target CPU, OR this INIT IPI might
	 * be latched (P5 bug), the CPU waiting for a STARTUP IPI, OR
	 * this INIT IPI might be ignored.
	 *
	 * see apic/apicreg.h for icr bit definitions.
	 *
	 * TIME CRITICAL CODE, DO NOT DO ANY KPRINTFS IN THE HOT PATH.
	 */
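
	/*
	 * The raw icr_lo command words used below decode, per the local
	 * APIC ICR layout, roughly as:
	 *
	 *	0x00004500	INIT, level asserted
	 *	0x00008500	INIT, level triggered, deasserted
	 *	0x00000600	STARTUP + vector (trampoline page number)
	 */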

	/*
	 * Setup the address for the target AP.  We can setup
	 * icr_hi once and then just trigger operations with
	 * icr_lo.
	 */
	icr_hi = lapic->icr_hi & ~APIC_ID_MASK;
	icr_hi |= (physical_cpu << 24);
	icr_lo = lapic->icr_lo & 0xfff00000;
	lapic->icr_hi = icr_hi;

	/*
	 * Do an INIT IPI: assert RESET
	 *
	 * Use edge triggered mode to assert INIT
	 */
	lapic->icr_lo = icr_lo | 0x00004500;
	while (lapic->icr_lo & APIC_DELSTAT_MASK)
		 /* spin */ ;

	/*
	 * The spec calls for a 10ms delay but we may have to use a
	 * MUCH lower delay to avoid bricking an AP due to a fast SMI
	 * interrupt.  We have other loops here too and dividing by 2
	 * doesn't seem to be enough even after subtracting 350us,
	 * so we divide by 4.
	 *
	 * Our minimum delay is 150uS, maximum is 10ms.  If no SMI
	 * interrupt was detected we use the full 10ms.
	 */
	if (smibest == 0)
		u_sleep(10000);
	else if (smibest < 150 * 4 + 350)
		u_sleep(150);
	else if ((smibest - 350) / 4 < 10000)
		u_sleep((smibest - 350) / 4);
	else
		u_sleep(10000);

	/*
	 * Do an INIT IPI: deassert RESET
	 *
	 * Use level triggered mode to deassert.  It is unclear
	 * why we need to do this.
	 */
	lapic->icr_lo = icr_lo | 0x00008500;
	while (lapic->icr_lo & APIC_DELSTAT_MASK)
		 /* spin */ ;
	u_sleep(150);				/* wait 150us */

	/*
	 * Next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched (P5 bug), in which case this 1st STARTUP would terminate
	 * immediately and the previously started INIT IPI would continue, OR
	 * the previous INIT IPI has already run and this STARTUP IPI will
	 * run, OR the previous INIT IPI was ignored and this STARTUP IPI
	 * will run.
	 */
	lapic->icr_lo = icr_lo | 0x00000600 | vector;
	while (lapic->icr_lo & APIC_DELSTAT_MASK)
		 /* spin */ ;
	u_sleep(200);				/* wait ~200uS */

	/*
	 * Finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
	 * the previous STARTUP IPI was cancelled by a latched INIT IPI.  OR
	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
	 * recognized after hardware RESET or INIT IPI.
	 */
	lapic->icr_lo = icr_lo | 0x00000600 | vector;
	while (lapic->icr_lo & APIC_DELSTAT_MASK)
		 /* spin */ ;

	/* Resume normal operation */
	cpu_enable_intr();

	/* wait for it to start, see ap_init() */
	set_apic_timer(5000000);		/* == 5 seconds */
	while (read_apic_timer()) {
		if (CPUMASK_TESTBIT(smp_startup_mask, gd->mi.gd_cpuid))
			return 1;	/* return SUCCESS */
	}

	return 0;		/* return FAILURE */
}

static
int
smitest(void)
{
	int64_t	ltsc;
	int64_t	ntsc;
	int64_t	ldelta;
	int64_t	ndelta;
	int	count;

	ldelta = 0;
	ndelta = 0;
	while (read_apic_timer()) {
		ltsc = rdtsc();
		for (count = 0; count < 100; ++count)
			ntsc = rdtsc();	/* force loop to occur */
		if (ldelta) {
			ndelta = ntsc - ltsc;
			if (ldelta > ndelta)
				ldelta = ndelta;
			if (ndelta > ldelta * 2)
				break;
		} else {
			ldelta = ntsc - ltsc;
		}
	}
	return(read_apic_timer());
}
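
/*
 * TLB shootdown state.  Roughly: smp_smurf_mask tracks cpus which already
 * have an Xinvltlb IPI queued (to avoid double-queueing), smp_invltlb_mask
 * tracks cpus which still owe a full cpu_invltlb(), smp_invmask tracks
 * cpus targeted by page-level invalidation commands, and smp_in_mask
 * (LOOPMASK_IN debugging only) tracks cpus currently inside
 * smp_inval_intr().
 */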

/*
 * Synchronously flush the TLB on all other CPUs.  The current cpu's
 * TLB is not flushed.  If the caller wishes to flush the current cpu's
 * TLB the caller must call cpu_invltlb() in addition to smp_invltlb().
 *
 * This routine may be called concurrently from multiple cpus.  When this
 * happens, smp_invltlb() can wind up sticking around in the confirmation
 * while() loop at the end as additional cpus are added to the global
 * cpumask, until they are acknowledged by another IPI.
 *
 * NOTE: If for some reason we were unable to start all cpus we cannot
 *	 safely use broadcast IPIs.
 */

cpumask_t smp_smurf_mask;
static cpumask_t smp_invltlb_mask;
#define LOOPRECOVER
#define LOOPMASK_IN
#ifdef LOOPMASK_IN
cpumask_t smp_in_mask;
#endif
cpumask_t smp_invmask;
extern cpumask_t smp_idleinvl_mask;
extern cpumask_t smp_idleinvl_reqs;

/*
 * Atomically OR bits in *mask to smp_smurf_mask.  Adjust *mask to remove
 * bits that do not need to be IPI'd.  These bits are still part of the
 * command, but the target cpus have already been signalled and do not
 * need to be signalled again.
 */
#include <sys/spinlock.h>
#include <sys/spinlock2.h>

static __noinline
void
smp_smurf_fetchset(cpumask_t *mask)
{
	cpumask_t omask;
	int i;
	__uint64_t obits;
	__uint64_t nbits;

	i = 0;
	while (i < CPUMASK_ELEMENTS) {
		obits = smp_smurf_mask.ary[i];
		cpu_ccfence();
		nbits = obits | mask->ary[i];
		if (atomic_cmpset_long(&smp_smurf_mask.ary[i], obits, nbits)) {
			omask.ary[i] = obits;
			++i;
		}
	}
	CPUMASK_NANDMASK(*mask, omask);
}

/*
 * This is a mechanism which guarantees that cpu_invltlb() will be executed
 * on idle cpus without having to signal or wake them up.  The invltlb will
 * be executed when they wake up, prior to any scheduling or interrupt
 * thread.
 *
 * (*mask) is modified to remove the cpus we successfully negotiate this
 * function with.  This function may only be used with semi-synchronous
 * commands (typically invltlb's or semi-synchronous invalidations which
 * are usually associated only with kernel memory).
 */
void
smp_smurf_idleinvlclr(cpumask_t *mask)
{
	if (optimized_invltlb) {
		ATOMIC_CPUMASK_ORMASK(smp_idleinvl_reqs, *mask);
		/* cpu_lfence() not needed */
		CPUMASK_NANDMASK(*mask, smp_idleinvl_mask);
	}
}

/*
 * Issue cpu_invltlb() across all cpus except the current cpu.
 *
 * This function will arrange to avoid idle cpus, but still guarantee that
 * invltlb is run on them when they wake up prior to any scheduling or
 * nominal interrupt.
 */
void
smp_invltlb(void)
{
	struct mdglobaldata *md = mdcpu;
	cpumask_t mask;
	unsigned long rflags;
#ifdef LOOPRECOVER
	tsc_uclock_t tsc_base = rdtsc();
	int repeats = 0;
#endif

	if (report_invltlb_src > 0) {
		if (--report_invltlb_src <= 0)
			print_backtrace(8);
	}

	/*
	 * Disallow normal interrupts, set all active cpus except our own
	 * in the global smp_invltlb_mask.
	 */
	++md->mi.gd_cnt.v_smpinvltlb;
	crit_enter_gd(&md->mi);

	/*
	 * Bits we want to set in smp_invltlb_mask.  We do not want to signal
	 * our own cpu.  Also try to remove bits associated with idle cpus
	 * that we can flag for auto-invltlb.
	 */
	mask = smp_active_mask;
	CPUMASK_NANDBIT(mask, md->mi.gd_cpuid);
	smp_smurf_idleinvlclr(&mask);

	rflags = read_rflags();
	cpu_disable_intr();
	ATOMIC_CPUMASK_ORMASK(smp_invltlb_mask, mask);

	/*
	 * IPI non-idle cpus represented by mask.  The omask calculation
	 * removes cpus from the mask which already have a Xinvltlb IPI
	 * pending (avoid double-queueing the IPI).
	 *
	 * We must disable real interrupts when setting the smurf flags or
	 * we might race a XINVLTLB before we manage to send the ipi's for
	 * the bits we set.
	 *
	 * NOTE: We are not signalling ourselves, mask already does NOT
	 *	 include our own cpu.
	 */
	smp_smurf_fetchset(&mask);

	/*
	 * Issue the IPI.  Note that the XINVLTLB IPI runs regardless of
	 * the critical section count on the target cpus.
	 */
	CPUMASK_ORMASK(mask, md->mi.gd_cpumask);
	if (all_but_self_ipi_enable &&
	    (all_but_self_ipi_enable >= 2 ||
	     CPUMASK_CMPMASKEQ(smp_startup_mask, mask))) {
		all_but_self_ipi(XINVLTLB_OFFSET);
	} else {
		CPUMASK_NANDMASK(mask, md->mi.gd_cpumask);
		selected_apic_ipi(mask, XINVLTLB_OFFSET, APIC_DELMODE_FIXED);
	}

	/*
	 * Wait for acknowledgement by all cpus.  smp_inval_intr() will
	 * temporarily enable interrupts to avoid deadlocking the lapic,
	 * and will also handle running cpu_invltlb() and remote invlpg
	 * commands on our cpu if some other cpu requests it of us.
	 *
	 * WARNING! I originally tried to implement this as a hard loop
	 *	    checking only smp_invltlb_mask (and issuing a local
	 *	    cpu_invltlb() if requested), with interrupts enabled
	 *	    and without calling smp_inval_intr().  This DID NOT WORK.
	 *	    It resulted in weird races where smurf bits would get
	 *	    cleared without any action being taken.
	 */
	smp_inval_intr();
	CPUMASK_ASSZERO(mask);
	while (CPUMASK_CMPMASKNEQ(smp_invltlb_mask, mask)) {
		smp_inval_intr();
		cpu_pause();
#ifdef LOOPRECOVER
		if (tsc_frequency && rdtsc() - tsc_base > tsc_frequency) {
			/*
			 * cpuid	- cpu doing the waiting
			 * invltlb_mask - IPI in progress
			 */
			kprintf("smp_invltlb %d: waited too long inv=%08jx "
				"smurf=%08jx "
#ifdef LOOPMASK_IN
				"in=%08jx "
#endif
				"idle=%08jx/%08jx\n",
				md->mi.gd_cpuid,
				smp_invltlb_mask.ary[0],
				smp_smurf_mask.ary[0],
#ifdef LOOPMASK_IN
				smp_in_mask.ary[0],
#endif
				smp_idleinvl_mask.ary[0],
				smp_idleinvl_reqs.ary[0]);
			mdcpu->gd_xinvaltlb = 0;
			ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask,
						smp_invltlb_mask);
			smp_invlpg(&smp_active_mask);
			tsc_base = rdtsc();
			if (++repeats > 10) {
				kprintf("smp_invltlb: giving up\n");
				CPUMASK_ASSZERO(smp_invltlb_mask);
			}
		}
#endif
	}
	write_rflags(rflags);
	crit_exit_gd(&md->mi);
}

/*
 * Called from a critical section with interrupts hard-disabled.
 * This function issues an XINVLTLB IPI and then executes any pending
 * command on the current cpu before returning.
 */
void
smp_invlpg(cpumask_t *cmdmask)
{
	struct mdglobaldata *md = mdcpu;
	cpumask_t mask;

	if (report_invlpg_src > 0) {
		if (--report_invlpg_src <= 0)
			print_backtrace(8);
	}

	/*
	 * Disallow normal interrupts, set all active cpus in the pmap,
	 * plus our own for completion processing (it might or might not
	 * be part of the set).
	 */
	mask = smp_active_mask;
	CPUMASK_ANDMASK(mask, *cmdmask);
	CPUMASK_ORMASK(mask, md->mi.gd_cpumask);

	/*
	 * Avoid double-queuing IPIs, which can deadlock us.  We must disable
	 * real interrupts when setting the smurf flags or we might race a
	 * XINVLTLB before we manage to send the ipi's for the bits we set.
	 *
	 * NOTE: We might be including our own cpu in the smurf mask.
	 */
	smp_smurf_fetchset(&mask);

	/*
	 * Issue the IPI.  Note that the XINVLTLB IPI runs regardless of
	 * the critical section count on the target cpus.
	 *
	 * We do not include our own cpu when issuing the IPI.
	 */
	if (all_but_self_ipi_enable &&
	    (all_but_self_ipi_enable >= 2 ||
	     CPUMASK_CMPMASKEQ(smp_startup_mask, mask))) {
		all_but_self_ipi(XINVLTLB_OFFSET);
	} else {
		CPUMASK_NANDMASK(mask, md->mi.gd_cpumask);
		selected_apic_ipi(mask, XINVLTLB_OFFSET, APIC_DELMODE_FIXED);
	}

	/*
	 * This will synchronously wait for our command to complete,
	 * as well as process commands from other cpus.  It also handles
	 * reentrancy.
	 *
	 * (interrupts are disabled and we are in a critical section here)
	 */
	smp_inval_intr();
}

void
smp_sniff(void)
{
	globaldata_t gd = mycpu;
	int dummy;
	register_t rflags;

	/*
	 * Ignore all_but_self_ipi_enable here and just use it.
	 */
	rflags = read_rflags();
	cpu_disable_intr();
	all_but_self_ipi(XSNIFF_OFFSET);
	gd->gd_sample_pc = smp_sniff;
	gd->gd_sample_sp = &dummy;
	write_rflags(rflags);
}

void
cpu_sniff(int dcpu)
{
	globaldata_t rgd = globaldata_find(dcpu);
	register_t rflags;
	int dummy;

	/*
	 * Ignore all_but_self_ipi_enable here and just use it.
	 */
	rflags = read_rflags();
	cpu_disable_intr();
	single_apic_ipi(dcpu, XSNIFF_OFFSET, APIC_DELMODE_FIXED);
	rgd->gd_sample_pc = cpu_sniff;
	rgd->gd_sample_sp = &dummy;
	write_rflags(rflags);
}

/*
 * Called from Xinvltlb assembly with interrupts hard-disabled and in a
 * critical section.  gd_intr_nesting_level may or may not be bumped
 * depending on entry.
 *
 * THIS CODE IS INTENDED TO EXPLICITLY IGNORE THE CRITICAL SECTION COUNT.
 * THAT IS, THE INTERRUPT IS INTENDED TO FUNCTION EVEN WHEN MAINLINE CODE
 * IS IN A CRITICAL SECTION.
 */
void
smp_inval_intr(void)
{
	struct mdglobaldata *md = mdcpu;
	cpumask_t cpumask;
#ifdef LOOPRECOVER
	tsc_uclock_t tsc_base = rdtsc();
#endif

#if 0
	/*
	 * The idle code is in a critical section, but that doesn't stop
	 * Xinvltlb from executing, so deal with the race which can occur
	 * in that situation.  Otherwise r-m-w operations by pmap_inval_intr()
	 * may have problems.
	 */
	if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs, md->mi.gd_cpuid)) {
		ATOMIC_CPUMASK_NANDBIT(smp_invltlb_mask, md->mi.gd_cpuid);
		cpu_invltlb();
		cpu_mfence();
	}
#endif

	/*
	 * This is a real mess.  I'd like to just leave interrupts disabled
	 * but it can cause the lapic to deadlock if too many interrupts queue
	 * to it, due to the idiotic design of the lapic.  So instead we have
	 * to enter a critical section so normal interrupts are made pending
	 * and track whether this one was reentered.
	 */
	if (md->gd_xinvaltlb) {		/* reentrant on cpu */
		md->gd_xinvaltlb = 2;
		return;
	}
	md->gd_xinvaltlb = 1;

	/*
	 * Check only those cpus with active Xinvl* commands pending.
	 *
	 * We are going to enable interrupts so make sure we are in a
	 * critical section.  This is necessary to avoid deadlocking
	 * the lapic and to ensure that we execute our commands prior to
	 * any nominal interrupt or preemption.
	 *
	 * WARNING! It is very important that we only clear our bit in
	 *	    smp_smurf_mask once for each interrupt we take.  In
	 *	    this case, we clear it on initial entry and only loop
	 *	    on the reentrancy detect (caused by another interrupt).
	 */
	cpumask = smp_invmask;
#ifdef LOOPMASK_IN
	ATOMIC_CPUMASK_ORBIT(smp_in_mask, md->mi.gd_cpuid);
#endif
loop:
	cpu_enable_intr();
	ATOMIC_CPUMASK_NANDBIT(smp_smurf_mask, md->mi.gd_cpuid);

	/*
	 * Specific page request(s), and we can't return until all bits
	 * are zero.
	 */
	for (;;) {
		int toolong;

		/*
		 * Also execute any pending full invalidation request in
		 * this loop.
		 */
		if (CPUMASK_TESTBIT(smp_invltlb_mask, md->mi.gd_cpuid)) {
			ATOMIC_CPUMASK_NANDBIT(smp_invltlb_mask,
					       md->mi.gd_cpuid);
			cpu_invltlb();
			cpu_mfence();
		}

#ifdef LOOPRECOVER
		if (tsc_frequency && rdtsc() - tsc_base > tsc_frequency) {
			/*
			 * cpuid	- cpu doing the waiting
			 * invmask	- IPI in progress
			 * invltlb_mask - which ones are TLB invalidations?
			 */
			kprintf("smp_inval_intr %d inv=%08jx tlbm=%08jx "
				"smurf=%08jx "
#ifdef LOOPMASK_IN
				"in=%08jx "
#endif
				"idle=%08jx/%08jx\n",
				md->mi.gd_cpuid,
				smp_invmask.ary[0],
				smp_invltlb_mask.ary[0],
				smp_smurf_mask.ary[0],
#ifdef LOOPMASK_IN
				smp_in_mask.ary[0],
#endif
				smp_idleinvl_mask.ary[0],
				smp_idleinvl_reqs.ary[0]);
			tsc_base = rdtsc();
			toolong = 1;
		} else {
			toolong = 0;
		}
#else
		toolong = 0;
#endif

		/*
		 * We can only add bits to the cpumask to test during the
		 * loop because the smp_invmask bit is cleared once the
		 * originator completes the command (the targets may still
		 * be cycling their own completions in this loop, afterwards).
		 *
		 * lfence required prior to all tests as this Xinvltlb
		 * interrupt could race the originator (already be in progress
		 * when the originator decides to issue, due to an issue by
		 * another cpu).
		 */
		cpu_lfence();
		CPUMASK_ORMASK(cpumask, smp_invmask);
		/*cpumask = smp_active_mask;*/	/* XXX */
		cpu_lfence();

		if (pmap_inval_intr(&cpumask, toolong) == 0) {
			/*
			 * Clear our smurf mask to allow new IPIs, but deal
			 * with potential races.
			 */
			break;
		}

		/*
		 * Test if someone sent us another invalidation IPI, break
		 * out so we can take it to avoid deadlocking the lapic
		 * interrupt queue (? stupid intel, amd).
		 */
		if (md->gd_xinvaltlb == 2)
			break;
		/*
		if (CPUMASK_TESTBIT(smp_smurf_mask, md->mi.gd_cpuid))
			break;
		*/
	}

	/*
	 * Full invalidation request
	 */
	if (CPUMASK_TESTBIT(smp_invltlb_mask, md->mi.gd_cpuid)) {
		ATOMIC_CPUMASK_NANDBIT(smp_invltlb_mask,
				       md->mi.gd_cpuid);
		cpu_invltlb();
		cpu_mfence();
	}

	/*
	 * Check to see if another Xinvltlb interrupt occurred and loop up
	 * if it did.
	 */
	cpu_disable_intr();
	if (md->gd_xinvaltlb == 2) {
		md->gd_xinvaltlb = 1;
		goto loop;
	}
#ifdef LOOPMASK_IN
	ATOMIC_CPUMASK_NANDBIT(smp_in_mask, md->mi.gd_cpuid);
#endif
	md->gd_xinvaltlb = 0;
}

void
cpu_wbinvd_on_all_cpus_callback(void *arg)
{
	wbinvd();
}

/*
 * When called the executing CPU will send an IPI to all other CPUs
 * requesting that they halt execution.
 *
 * Usually (but not necessarily) called with 'other_cpus' as its arg.
 *
 *  - Signals all CPUs in map to stop.
 *  - Waits for each to stop.
 *
 * Returns:
 *  -1: error
 *   0: NA
 *   1: ok
 *
 * XXX FIXME: this is not MP-safe, needs a lock to prevent multiple CPUs
 *            from executing at the same time.
 */
int
stop_cpus(cpumask_t map)
{
	cpumask_t mask;

	CPUMASK_ANDMASK(map, smp_active_mask);

	/* send the Xcpustop IPI to all CPUs in map */
	selected_apic_ipi(map, XCPUSTOP_OFFSET, APIC_DELMODE_FIXED);

	do {
		mask = stopped_cpus;
		CPUMASK_ANDMASK(mask, map);
		/* spin */
	} while (CPUMASK_CMPMASKNEQ(mask, map));

	return 1;
}


/*
 * Called by a CPU to restart stopped CPUs.
 *
 * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
 *
 *  - Signals all CPUs in map to restart.
 *  - Waits for each to restart.
 *
 * Returns:
 *  -1: error
 *   0: NA
 *   1: ok
 */
int
restart_cpus(cpumask_t map)
{
	cpumask_t mask;

	/* signal other cpus to restart */
	mask = map;
	CPUMASK_ANDMASK(mask, smp_active_mask);
	cpu_ccfence();
	started_cpus = mask;
	cpu_ccfence();

	/* wait for each to clear its bit */
	while (CPUMASK_CMPMASKNEQ(stopped_cpus, map))
		cpu_pause();

	return 1;
}

/*
 * This is called once the mpboot code has gotten us properly relocated
 * and the MMU turned on, etc.  ap_init() is actually the idle thread,
 * and when it returns the scheduler will call the real cpu_idle() main
 * loop for the idlethread.  Interrupts are disabled on entry and should
 * remain disabled at return.
 */
void
ap_init(void)
{
	int	cpu_id;

	/*
	 * Adjust smp_startup_mask to signal the BSP that we have started
	 * up successfully.  Note that we do not yet hold the BGL.  The BSP
	 * is waiting for our signal.
	 *
	 * We can't set our bit in smp_active_mask yet because we are holding
	 * interrupts physically disabled and remote cpus could deadlock
	 * trying to send us an IPI.
	 */
	ATOMIC_CPUMASK_ORBIT(smp_startup_mask, mycpu->gd_cpuid);
	cpu_mfence();

	/*
	 * Interlock for LAPIC initialization.  Wait until mp_finish_lapic is
	 * non-zero, then get the MP lock.
	 *
	 * Note: We are in a critical section.
	 *
	 * Note: we are the idle thread, we can only spin.
	 *
	 * Note: The load fence is memory volatile and prevents the compiler
	 * from improperly caching mp_finish_lapic, and the cpu from improperly
	 * caching it.
	 */
	while (mp_finish_lapic == 0) {
		cpu_pause();
		cpu_lfence();
	}
#if 0
	while (try_mplock() == 0) {
		cpu_pause();
		cpu_lfence();
	}
#endif

	if (cpu_feature & CPUID_TSC) {
		/*
		 * The BSP is constantly updating tsc0_offset, figure out
		 * the relative difference to synchronize ktrdump.
		 */
		tsc_offsets[mycpu->gd_cpuid] = rdtsc() - tsc0_offset;
	}

	/* BSP may have changed PTD while we're waiting for the lock */
	cpu_invltlb();

	/* Build our map of 'other' CPUs. */
	mycpu->gd_other_cpus = smp_startup_mask;
	ATOMIC_CPUMASK_NANDBIT(mycpu->gd_other_cpus, mycpu->gd_cpuid);

	/* A quick check from sanity claus */
	cpu_id = APICID_TO_CPUID((lapic->id & 0xff000000) >> 24);
	if (mycpu->gd_cpuid != cpu_id) {
		kprintf("SMP: assigned cpuid = %d\n", mycpu->gd_cpuid);
		kprintf("SMP: actual cpuid = %d lapicid %d\n",
			cpu_id, (lapic->id & 0xff000000) >> 24);
#if 0 /* JGXXX */
		kprintf("PTD[MPPTDI] = %p\n", (void *)PTD[MPPTDI]);
#endif
		panic("cpuid mismatch! boom!!");
	}

	/* Initialize AP's local APIC for irq's */
	lapic_init(FALSE);

	/* LAPIC initialization is done */
	ATOMIC_CPUMASK_ORBIT(smp_lapic_mask, mycpu->gd_cpuid);
	cpu_mfence();

#if 0
	/* Let BSP move onto the next initialization stage */
	rel_mplock();
#endif

	/*
	 * Interlock for finalization.  Wait until mp_finish is non-zero,
	 * then get the MP lock.
	 *
	 * Note: We are in a critical section.
	 *
	 * Note: we are the idle thread, we can only spin.
	 *
	 * Note: The load fence is memory volatile and prevents the compiler
	 * from improperly caching mp_finish, and the cpu from improperly
	 * caching it.
	 */
	while (mp_finish == 0) {
		cpu_pause();
		cpu_lfence();
	}

	/* BSP may have changed PTD while we're waiting for the lock */
	cpu_invltlb();

	/* Set memory range attributes for this CPU to match the BSP */
	mem_range_AP_init();

	/*
	 * Once we go active we must process any IPIQ messages that may
	 * have been queued, because no actual IPI will occur until we
	 * set our bit in the smp_active_mask.  If we don't the IPI
	 * message interlock could be left set which would also prevent
	 * further IPIs.
	 *
	 * The idle loop doesn't expect the BGL to be held and while
	 * lwkt_switch() normally cleans things up this is a special case
	 * because we are returning almost directly into the idle loop.
	 *
	 * The idle thread is never placed on the runq, make sure
	 * nothing we've done put it there.
	 */

	/*
	 * Hold a critical section and allow real interrupts to occur.  Zero
	 * any spurious interrupts which have accumulated, then set our
	 * smp_active_mask indicating that we are fully operational.
	 */
	crit_enter();
	__asm __volatile("sti; pause; pause"::);
	bzero(mdcpu->gd_ipending, sizeof(mdcpu->gd_ipending));
	ATOMIC_CPUMASK_ORBIT(smp_active_mask, mycpu->gd_cpuid);

	/*
	 * Wait until all cpus have set their smp_active_mask and have fully
	 * operational interrupts before proceeding.
	 *
	 * We need a final cpu_invltlb() because we would not have received
	 * any until we set our bit in smp_active_mask.
	 */
	while (mp_finish == 1) {
		cpu_pause();
		cpu_lfence();
	}
	cpu_invltlb();

	/*
	 * Initialize per-cpu clocks and do other per-cpu initialization.
	 * At this point code is expected to be able to use the full kernel
	 * API.
	 */
	initclocks_pcpu();	/* clock interrupts (via IPIs) */

	/*
	 * Since we may have cleaned up the interrupt triggers, manually
	 * process any pending IPIs before exiting our critical section.
	 * Once the critical section has exited, normal interrupt processing
	 * may occur.
	 */
	atomic_swap_int(&mycpu->gd_npoll, 0);
	lwkt_process_ipiq();
	crit_exit();

	/*
	 * Final final, allow the waiting BSP to resume the boot process,
	 * return 'into' the idle thread bootstrap.
	 */
	ATOMIC_CPUMASK_ORBIT(smp_finalize_mask, mycpu->gd_cpuid);
	KKASSERT((curthread->td_flags & TDF_RUNQ) == 0);
}

/*
 * Get SMP fully working before we start initializing devices.
 */
static
void
ap_finish(void)
{
	if (bootverbose)
		kprintf("Finish MP startup\n");
	rel_mplock();

	/*
	 * Wait for the active mask to complete, after which all cpus will
	 * be accepting interrupts.
	 */
	mp_finish = 1;
	while (CPUMASK_CMPMASKNEQ(smp_active_mask, smp_startup_mask)) {
		cpu_pause();
		cpu_lfence();
	}

	/*
	 * Wait for the finalization mask to complete, after which all cpus
	 * have completely finished initializing and are entering or are in
	 * their idle thread.
	 *
	 * BSP should have received all required invltlbs but do another
	 * one just in case.
	 */
	cpu_invltlb();
	mp_finish = 2;
	while (CPUMASK_CMPMASKNEQ(smp_finalize_mask, smp_startup_mask)) {
		cpu_pause();
		cpu_lfence();
	}

	while (try_mplock() == 0) {
		cpu_pause();
		cpu_lfence();
	}

	if (bootverbose) {
		kprintf("Active CPU Mask: %016jx\n",
			(uintmax_t)CPUMASK_LOWMASK(smp_active_mask));
	}
}

SYSINIT(finishsmp, SI_BOOT2_FINISH_SMP, SI_ORDER_FIRST, ap_finish, NULL);

/*
 * Interrupts must be hard-disabled by caller
 */
void
cpu_send_ipiq(int dcpu)
{
	if (CPUMASK_TESTBIT(smp_active_mask, dcpu))
		single_apic_ipi(dcpu, XIPIQ_OFFSET, APIC_DELMODE_FIXED);
}

#if 0	/* single_apic_ipi_passive() not working yet */
/*
 * Returns 0 on failure, 1 on success
 */
int
cpu_send_ipiq_passive(int dcpu)
{
	int r = 0;
	if (CPUMASK_TESTBIT(smp_active_mask, dcpu)) {
		r = single_apic_ipi_passive(dcpu, XIPIQ_OFFSET,
					    APIC_DELMODE_FIXED);
	}
	return(r);
}
#endif

static void
mp_bsp_simple_setup(void)
{
	struct mdglobaldata *gd;
	size_t ipiq_size;

	/* build our map of 'other' CPUs */
	mycpu->gd_other_cpus = smp_startup_mask;
	CPUMASK_NANDBIT(mycpu->gd_other_cpus, mycpu->gd_cpuid);

	gd = (struct mdglobaldata *)mycpu;
	gd->gd_acpi_id = CPUID_TO_ACPIID(mycpu->gd_cpuid);

	ipiq_size = sizeof(struct lwkt_ipiq) * ncpus;
	mycpu->gd_ipiq = (void *)kmem_alloc(&kernel_map, ipiq_size,
					    VM_SUBSYS_IPIQ);
	bzero(mycpu->gd_ipiq, ipiq_size);

	/* initialize arc4random. */
	arc4_init_pcpu(0);

	pmap_set_opt();

	if (cpu_feature & CPUID_TSC)
		tsc0_offset = rdtsc();
}


/*
 * CPU TOPOLOGY DETECTION FUNCTIONS
 */
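
/*
 * The detection code below computes two shift counts which describe how
 * an APIC ID is partitioned (most significant bits on the left):
 *
 *	[ chip_ID | core (core_bits) | logical cpu (logical_CPU_bits) ]
 *
 * For example, with logical_CPU_bits == 1 and core_bits == 2, APIC ID
 * 11 (0b1011) decodes to chip 1, core 1, logical cpu 1 via the
 * get_chip_ID(), get_core_number_within_chip() and
 * get_logical_CPU_number_within_core() helpers further below.
 */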

/*
 * Detect Intel topology using CPUID
 * Ref: http://www.intel.com/Assets/PDF/appnote/241618.pdf, pg 41
 */
static void
detect_intel_topology(int count_htt_cores)
{
	int shift = 0;
	int ecx_index = 0;
	int core_plus_logical_bits = 0;
	int cores_per_package;
	int logical_per_package;
	int logical_per_core;
	unsigned int p[4];

	if (cpu_high >= 0xb) {
		goto FUNC_B;

	} else if (cpu_high >= 0x4) {
		goto FUNC_4;

	} else {
		core_bits = 0;
		for (shift = 0; (1 << shift) < count_htt_cores; ++shift)
			;
		logical_CPU_bits = 1 << shift;
		return;
	}

FUNC_B:
	cpuid_count(0xb, FUNC_B_THREAD_LEVEL, p);

	/* if 0xb not supported - fall back to 0x4 */
	if (p[1] == 0 || (FUNC_B_TYPE(p[2]) != FUNC_B_THREAD_TYPE)) {
		goto FUNC_4;
	}

	logical_CPU_bits = FUNC_B_BITS_SHIFT_NEXT_LEVEL(p[0]);

	ecx_index = FUNC_B_THREAD_LEVEL + 1;
	do {
		cpuid_count(0xb, ecx_index, p);

		/* Check for the Core type in the implemented sub leaves. */
		if (FUNC_B_TYPE(p[2]) == FUNC_B_CORE_TYPE) {
			core_plus_logical_bits = FUNC_B_BITS_SHIFT_NEXT_LEVEL(p[0]);
			break;
		}

		ecx_index++;

	} while (FUNC_B_TYPE(p[2]) != FUNC_B_INVALID_TYPE);

	core_bits = core_plus_logical_bits - logical_CPU_bits;

	return;

FUNC_4:
	cpuid_count(0x4, 0, p);
	cores_per_package = FUNC_4_MAX_CORE_NO(p[0]) + 1;

	logical_per_package = count_htt_cores;
	logical_per_core = logical_per_package / cores_per_package;

	for (shift = 0; (1 << shift) < logical_per_core; ++shift)
		;
	logical_CPU_bits = shift;

	for (shift = 0; (1 << shift) < cores_per_package; ++shift)
		;
	core_bits = shift;

	return;
}

/*
 * Detect AMD topology using CPUID
 * Ref: http://support.amd.com/us/Embedded_TechDocs/25481.pdf, last page
 */
static void
detect_amd_topology(int count_htt_cores)
{
	int shift = 0;

	if ((cpu_feature & CPUID_HTT) && (amd_feature2 & AMDID2_CMP)) {
		if (cpu_procinfo2 & AMDID_COREID_SIZE) {
			core_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
				    AMDID_COREID_SIZE_SHIFT;
		} else {
			core_bits = (cpu_procinfo2 & AMDID_CMP_CORES) + 1;
			for (shift = 0; (1 << shift) < core_bits; ++shift)
				;
			core_bits = shift;
		}

		if (amd_feature2 & AMDID2_TOPOEXT) {
			u_int p[4];
			int i;
			int type;
			int level;
			int share_count;

			for (i = 0; i < 256; ++i)  {
				cpuid_count(0x8000001d, i, p);
				type = p[0] & 0x1f;
				level = (p[0] >> 5) & 0x7;
				share_count = 1 + ((p[0] >> 14) & 0xfff);

				if (type == 0)
					break;
				if (bootverbose)
					kprintf("Topology probe i=%2d type=%d level=%d share_count=%d\n",
						i, type, level, share_count);
				if (type == 1 && share_count) {	/* CPUID_TYPE_SMT */
					for (shift = 0; (1 << shift) < count_htt_cores / share_count; ++shift)
						;
					core_bits = shift;
					break;
				}
			}
		}

		logical_CPU_bits = count_htt_cores >> core_bits;
		for (shift = 0; (1 << shift) < logical_CPU_bits; ++shift)
			;
		logical_CPU_bits = shift;
	} else {
		for (shift = 0; (1 << shift) < count_htt_cores; ++shift)
			;
		core_bits = shift;
		logical_CPU_bits = 0;
	}
}

static void
amd_get_compute_unit_id(void *arg)
{
	u_int regs[4];

	do_cpuid(0x8000001e, regs);
	cpu_node_t * mynode = get_cpu_node_by_cpuid(mycpuid);

	/*
	 * AMD - CPUID Specification September 2010
	 * page 34 - //ComputeUnitID = ebx[0:7]//
	 */
	mynode->compute_unit_id = regs[1] & 0xff;
}

int
fix_amd_topology(void)
{
	cpumask_t mask;

	if (cpu_vendor_id != CPU_VENDOR_AMD)
		return -1;
	if ((amd_feature2 & AMDID2_TOPOEXT) == 0)
		return -1;

	CPUMASK_ASSALLONES(mask);
	lwkt_cpusync_simple(mask, amd_get_compute_unit_id, NULL);

	kprintf("Compute unit IDs:\n");
	int i;
	for (i = 0; i < ncpus; i++) {
		kprintf("%d-%d; \n",
			i, get_cpu_node_by_cpuid(i)->compute_unit_id);
	}

	return 0;
}

/*
 * Calculate
 * - logical_CPU_bits
 * - core_bits
 * With the values above (for AMD or INTEL) we are able to generally
 * detect the CPU topology (number of cores for each level):
 * Ref: http://wiki.osdev.org/Detecting_CPU_Topology_(80x86)
 * Ref: http://www.multicoreinfo.com/research/papers/whitepapers/Intel-detect-topology.pdf
 */
void
detect_cpu_topology(void)
{
	static int topology_detected = 0;
	int count = 0;

	if (topology_detected)
		goto OUT;
	if ((cpu_feature & CPUID_HTT) == 0) {
		core_bits = 0;
		logical_CPU_bits = 0;
		goto OUT;
	}
	count = (cpu_procinfo & CPUID_HTT_CORES) >> CPUID_HTT_CORE_SHIFT;

	if (cpu_vendor_id == CPU_VENDOR_INTEL)
		detect_intel_topology(count);
	else if (cpu_vendor_id == CPU_VENDOR_AMD)
		detect_amd_topology(count);
	topology_detected = 1;

OUT:
	if (bootverbose) {
		kprintf("Bits within APICID: logical_CPU_bits: %d; "
			"core_bits: %d\n",
			logical_CPU_bits, core_bits);
	}
}

/*
 * Interface functions to calculate chip_ID,
 * core_number and logical_number
 * Ref: http://wiki.osdev.org/Detecting_CPU_Topology_(80x86)
 */
int
get_chip_ID(int cpuid)
{
	return get_apicid_from_cpuid(cpuid) >>
	    (logical_CPU_bits + core_bits);
}

int
get_chip_ID_from_APICID(int apicid)
{
	return apicid >> (logical_CPU_bits + core_bits);
}

int
get_core_number_within_chip(int cpuid)
{
	return ((get_apicid_from_cpuid(cpuid) >> logical_CPU_bits) &
		((1 << core_bits) - 1));
}

int
get_logical_CPU_number_within_core(int cpuid)
{
	return (get_apicid_from_cpuid(cpuid) &
		((1 << logical_CPU_bits) - 1));
}