xref: /freebsd/sys/i386/i386/mp_machdep.c (revision 39beb93c)
1 /*-
2  * Copyright (c) 1996, by Steve Passe
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. The name of the developer may NOT be used to endorse or promote products
11  *    derived from this software without specific prior written permission.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28 
29 #include "opt_apic.h"
30 #include "opt_cpu.h"
31 #include "opt_kstack_pages.h"
32 #include "opt_mp_watchdog.h"
33 #include "opt_sched.h"
34 #include "opt_smp.h"
35 
36 #if !defined(lint)
37 #if !defined(SMP)
38 #error How did you get here?
39 #endif
40 
41 #ifndef DEV_APIC
42 #error The apic device is required for SMP, add "device apic" to your config file.
43 #endif
44 #if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
45 #error SMP not supported with CPU_DISABLE_CMPXCHG
46 #endif
47 #endif /* not lint */
48 
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/bus.h>
52 #include <sys/cons.h>	/* cngetc() */
53 #ifdef GPROF
54 #include <sys/gmon.h>
55 #endif
56 #include <sys/kernel.h>
57 #include <sys/ktr.h>
58 #include <sys/lock.h>
59 #include <sys/malloc.h>
60 #include <sys/memrange.h>
61 #include <sys/mutex.h>
62 #include <sys/pcpu.h>
63 #include <sys/proc.h>
64 #include <sys/sched.h>
65 #include <sys/smp.h>
66 #include <sys/sysctl.h>
67 
68 #include <vm/vm.h>
69 #include <vm/vm_param.h>
70 #include <vm/pmap.h>
71 #include <vm/vm_kern.h>
72 #include <vm/vm_extern.h>
73 
74 #include <machine/apicreg.h>
75 #include <machine/cputypes.h>
76 #include <machine/md_var.h>
77 #include <machine/mp_watchdog.h>
78 #include <machine/pcb.h>
79 #include <machine/psl.h>
80 #include <machine/smp.h>
81 #include <machine/specialreg.h>
82 
83 #define WARMBOOT_TARGET		0
84 #define WARMBOOT_OFF		(KERNBASE + 0x0467)
85 #define WARMBOOT_SEG		(KERNBASE + 0x0469)
86 
87 #define CMOS_REG		(0x70)
88 #define CMOS_DATA		(0x71)
89 #define BIOS_RESET		(0x0f)
90 #define BIOS_WARM		(0x0a)
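/*
 * AP startup leans on the BIOS warm-boot path: start_all_aps() writes
 * BIOS_WARM into the CMOS shutdown status byte (BIOS_RESET) and points the
 * real-mode warm-reset vector at physical 0x467 (WARMBOOT_OFF/WARMBOOT_SEG,
 * addressed through KERNBASE) at the AP trampoline, so an AP that comes up
 * through the BIOS reset path rather than directly at the STARTUP IPI
 * vector still lands in the trampoline.  The old values are restored once
 * all APs have started.
 */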
91 
92 /*
93  * This code MUST be enabled both here and in mpboot.s.
94  * It traces the very early stages of AP boot by placing values in CMOS RAM.
95  * It is NORMALLY never needed, hence the primitive method of enabling it.
96  *
97 #define CHECK_POINTS
98  */
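/*
 * When CHECK_POINTS is enabled, the early AP boot stages stamp values into
 * CMOS locations 0x34-0x39; start_all_aps() seeds them with CHECK_INIT()
 * and dumps them with CHECK_PRINT() so a wedged AP's progress can be seen.
 */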
99 
100 #if defined(CHECK_POINTS) && !defined(PC98)
101 #define CHECK_READ(A)	 (outb(CMOS_REG, (A)), inb(CMOS_DATA))
102 #define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))
103 
104 #define CHECK_INIT(D);				\
105 	CHECK_WRITE(0x34, (D));			\
106 	CHECK_WRITE(0x35, (D));			\
107 	CHECK_WRITE(0x36, (D));			\
108 	CHECK_WRITE(0x37, (D));			\
109 	CHECK_WRITE(0x38, (D));			\
110 	CHECK_WRITE(0x39, (D));
111 
112 #define CHECK_PRINT(S);				\
113 	printf("%s: %d, %d, %d, %d, %d, %d\n",	\
114 	   (S),					\
115 	   CHECK_READ(0x34),			\
116 	   CHECK_READ(0x35),			\
117 	   CHECK_READ(0x36),			\
118 	   CHECK_READ(0x37),			\
119 	   CHECK_READ(0x38),			\
120 	   CHECK_READ(0x39));
121 
122 #else				/* CHECK_POINTS */
123 
124 #define CHECK_INIT(D)
125 #define CHECK_PRINT(S)
126 #define CHECK_WRITE(A, D)
127 
128 #endif				/* CHECK_POINTS */
129 
130 /* lock region used by kernel profiling */
131 int	mcount_lock;
132 
133 int	mp_naps;		/* # of application processors */
134 int	boot_cpu_id = -1;	/* designated BSP */
135 
136 extern	struct pcpu __pcpu[];
137 
138 /* AP uses this during bootstrap.  Do not staticize.  */
139 char *bootSTK;
140 static int bootAP;
141 
142 /* Free these after use */
143 void *bootstacks[MAXCPU];
144 
145 /* Hotwire a 0->4MB V==P mapping */
146 extern pt_entry_t *KPTphys;
147 
148 struct pcb stoppcbs[MAXCPU];
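/*
 * Register context saved by each CPU in cpustop_handler(); primarily of
 * interest to the debugger while the other CPUs are held stopped.
 */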
149 
150 /* Variables needed for SMP tlb shootdown. */
151 vm_offset_t smp_tlb_addr1;
152 vm_offset_t smp_tlb_addr2;
153 volatile int smp_tlb_wait;
154 
155 #ifdef STOP_NMI
156 volatile cpumask_t ipi_nmi_pending;
157 
158 static void	ipi_nmi_selected(u_int32_t cpus);
159 #endif
160 
161 #ifdef COUNT_IPIS
162 /* Interrupt counts. */
163 static u_long *ipi_preempt_counts[MAXCPU];
164 static u_long *ipi_ast_counts[MAXCPU];
165 u_long *ipi_invltlb_counts[MAXCPU];
166 u_long *ipi_invlrng_counts[MAXCPU];
167 u_long *ipi_invlpg_counts[MAXCPU];
168 u_long *ipi_invlcache_counts[MAXCPU];
169 u_long *ipi_rendezvous_counts[MAXCPU];
170 u_long *ipi_lazypmap_counts[MAXCPU];
171 #endif
172 
173 /*
174  * Local data and functions.
175  */
176 
177 #ifdef STOP_NMI
178 /*
179  * Provide an alternate method of stopping other CPUs. If another CPU has
180  * disabled interrupts the conventional STOP IPI will be blocked. This
181  * NMI-based stop should get through in that case.
182  */
183 static int stop_cpus_with_nmi = 1;
184 SYSCTL_INT(_debug, OID_AUTO, stop_cpus_with_nmi, CTLTYPE_INT | CTLFLAG_RW,
185     &stop_cpus_with_nmi, 0, "");
186 TUNABLE_INT("debug.stop_cpus_with_nmi", &stop_cpus_with_nmi);
187 #else
188 #define	stop_cpus_with_nmi	0
189 #endif
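/*
 * When NMIs are used for stopping, the target CPUs are first recorded in
 * ipi_nmi_pending; the NMI trap handler calls ipi_nmi_handler(), which uses
 * that mask to tell a stop request apart from an unrelated NMI before
 * parking the CPU in cpustop_handler().
 */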
190 
191 static u_int logical_cpus;
192 
193 /* used to hold the APs until we are ready to release them */
194 static struct mtx ap_boot_mtx;
195 
196 /* Set to 1 once we're ready to let the APs out of the pen. */
197 static volatile int aps_ready = 0;
198 
199 /*
200  * Store data from cpu_add() until later in the boot when we actually setup
201  * the APs.
202  */
203 struct cpu_info {
204 	int	cpu_present:1;
205 	int	cpu_bsp:1;
206 	int	cpu_disabled:1;
207 	int	cpu_hyperthread:1;
208 } static cpu_info[MAX_APIC_ID + 1];
209 int cpu_apic_ids[MAXCPU];
210 int apic_cpuids[MAX_APIC_ID + 1];
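/*
 * cpu_apic_ids[] maps a logical CPU number to its local APIC ID and
 * apic_cpuids[] is the inverse mapping; slot 0 is filled in for the BSP by
 * cpu_mp_start() and the rest by assign_cpu_ids().
 */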
211 
212 /* Holds pending bitmap based IPIs per CPU */
213 static volatile u_int cpu_ipi_pending[MAXCPU];
214 
215 static u_int boot_address;
216 
217 static void	assign_cpu_ids(void);
218 static void	install_ap_tramp(void);
219 static void	set_interrupt_apic_ids(void);
220 static int	start_all_aps(void);
221 static int	start_ap(int apic_id);
222 static void	release_aps(void *dummy);
223 
224 static int	hlt_logical_cpus;
225 static u_int	hyperthreading_cpus;
226 static cpumask_t	hyperthreading_cpus_mask;
227 static int	hyperthreading_allowed = 1;
228 static struct	sysctl_ctx_list logical_cpu_clist;
229 
230 static void
231 mem_range_AP_init(void)
232 {
233 	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
234 		mem_range_softc.mr_op->initAP(&mem_range_softc);
235 }
236 
237 struct cpu_group *
238 cpu_topo(void)
239 {
240 	if (cpu_cores == 0)
241 		cpu_cores = 1;
242 	if (cpu_logical == 0)
243 		cpu_logical = 1;
244 	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
245 		printf("WARNING: Non-uniform processors.\n");
246 		printf("WARNING: Using suboptimal topology.\n");
247 		return (smp_topo_none());
248 	}
249 	/*
250 	 * No multi-core or hyper-threaded.
251 	 */
252 	if (cpu_logical * cpu_cores == 1)
253 		return (smp_topo_none());
254 	/*
255 	 * Only HTT no multi-core.
256 	 */
257 	if (cpu_logical > 1 && cpu_cores == 1)
258 		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
259 	/*
260 	 * Only multi-core no HTT.
261 	 */
262 	if (cpu_cores > 1 && cpu_logical == 1)
263 		return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
264 	/*
265 	 * Both HTT and multi-core.
266 	 */
267 	return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
268 	    CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
269 }
270 
271 
272 /*
273  * Calculate usable address in base memory for AP trampoline code.
274  */
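/*
 * E.g. with 0x9fc00 bytes of base memory, trunc_page() yields 0x9f000; if
 * the 0xc00 bytes above that are too small for bootMP_size, the address
 * drops one further page, to 0x9e000.
 */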
275 u_int
276 mp_bootaddress(u_int basemem)
277 {
278 
279 	boot_address = trunc_page(basemem);	/* round down to 4k boundary */
280 	if ((basemem - boot_address) < bootMP_size)
281 		boot_address -= PAGE_SIZE;	/* not enough, lower by 4k */
282 
283 	return boot_address;
284 }
285 
286 void
287 cpu_add(u_int apic_id, char boot_cpu)
288 {
289 
290 	if (apic_id > MAX_APIC_ID) {
291 		panic("SMP: APIC ID %d too high", apic_id);
292 		return;
293 	}
294 	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
295 	    apic_id));
296 	cpu_info[apic_id].cpu_present = 1;
297 	if (boot_cpu) {
298 		KASSERT(boot_cpu_id == -1,
299 		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
300 		    boot_cpu_id));
301 		boot_cpu_id = apic_id;
302 		cpu_info[apic_id].cpu_bsp = 1;
303 	}
304 	if (mp_ncpus < MAXCPU)
305 		mp_ncpus++;
306 	if (bootverbose)
307 		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
308 		    "AP");
309 }
310 
311 void
312 cpu_mp_setmaxid(void)
313 {
314 
315 	mp_maxid = MAXCPU - 1;
316 }
317 
318 int
319 cpu_mp_probe(void)
320 {
321 
322 	/*
323 	 * Always record BSP in CPU map so that the mbuf init code works
324 	 * correctly.
325 	 */
326 	all_cpus = 1;
327 	if (mp_ncpus == 0) {
328 		/*
329 		 * No CPUs were found, so this must be a UP system.  Setup
330 		 * the variables to represent a system with a single CPU
331 		 * with an id of 0.
332 		 */
333 		mp_ncpus = 1;
334 		return (0);
335 	}
336 
337 	/* At least one CPU was found. */
338 	if (mp_ncpus == 1) {
339 		/*
340 		 * One CPU was found, so this must be a UP system with
341 		 * an I/O APIC.
342 		 */
343 		return (0);
344 	}
345 
346 	/* At least two CPUs were found. */
347 	return (1);
348 }
349 
350 /*
351  * Initialize the IPI handlers and start up the APs.
352  */
353 void
354 cpu_mp_start(void)
355 {
356 	int i;
357 	u_int threads_per_cache, p[4];
358 
359 	/* Initialize the logical ID to APIC ID table. */
360 	for (i = 0; i < MAXCPU; i++) {
361 		cpu_apic_ids[i] = -1;
362 		cpu_ipi_pending[i] = 0;
363 	}
364 
365 	/* Install an inter-CPU IPI for TLB invalidation */
366 	setidt(IPI_INVLTLB, IDTVEC(invltlb),
367 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
368 	setidt(IPI_INVLPG, IDTVEC(invlpg),
369 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
370 	setidt(IPI_INVLRNG, IDTVEC(invlrng),
371 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
372 
373 	/* Install an inter-CPU IPI for cache invalidation. */
374 	setidt(IPI_INVLCACHE, IDTVEC(invlcache),
375 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
376 
377 	/* Install an inter-CPU IPI for lazy pmap release */
378 	setidt(IPI_LAZYPMAP, IDTVEC(lazypmap),
379 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
380 
381 	/* Install an inter-CPU IPI for all-CPU rendezvous */
382 	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous),
383 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
384 
385 	/* Install generic inter-CPU IPI handler */
386 	setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
387 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
388 
389 	/* Install an inter-CPU IPI for CPU stop/restart */
390 	setidt(IPI_STOP, IDTVEC(cpustop),
391 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
392 
393 
394 	/* Set boot_cpu_id if needed. */
395 	if (boot_cpu_id == -1) {
396 		boot_cpu_id = PCPU_GET(apic_id);
397 		cpu_info[boot_cpu_id].cpu_bsp = 1;
398 	} else
399 		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
400 		    ("BSP's APIC ID doesn't match boot_cpu_id"));
401 	cpu_apic_ids[0] = boot_cpu_id;
402 	apic_cpuids[boot_cpu_id] = 0;
403 
404 	/* Setup the initial logical CPUs info. */
405 	logical_cpus = logical_cpus_mask = 0;
406 	if (cpu_feature & CPUID_HTT)
407 		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
408 
409 	/*
410 	 * Work out if hyperthreading is *really* enabled.  This
411 	 * is made really ugly by the fact that processors lie: Dual
412 	 * core processors claim to be hyperthreaded even when they're
413 	 * not, presumably because they want to be treated the same
414 	 * way as HTT with respect to per-cpu software licensing.
415 	 * At the time of writing (May 12, 2005) the only hyperthreaded
416 	 * cpus are from Intel, and Intel's dual-core processors can be
417 	 * identified via the "deterministic cache parameters" cpuid
418 	 * calls.
419 	 */
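	/*
	 * With the deterministic cache parameters (CPUID leaf 4), EAX bits
	 * 25:14 of each subleaf give the number of logical processors
	 * sharing that cache, minus one, and EAX bits 4:0 give the cache
	 * type (0 meaning no further caches).  The loop below recovers
	 * threads_per_cache from those bits; a dual-core package without
	 * HTT typically reports a single thread per L1 cache there, which
	 * is how such CPUs are told apart from genuinely hyperthreaded ones.
	 */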
420 	/*
421 	 * First determine if this is an Intel processor which claims
422 	 * to have hyperthreading support.
423 	 */
424 	if ((cpu_feature & CPUID_HTT) && cpu_vendor_id == CPU_VENDOR_INTEL) {
425 		/*
426 		 * If the "deterministic cache parameters" cpuid calls
427 		 * are available, use them.
428 		 */
429 		if (cpu_high >= 4) {
430 			/* Ask the processor about the L1 cache. */
431 			for (i = 0; i < 1; i++) {
432 				cpuid_count(4, i, p);
433 				threads_per_cache = ((p[0] & 0x3ffc000) >> 14) + 1;
434 				if (hyperthreading_cpus < threads_per_cache)
435 					hyperthreading_cpus = threads_per_cache;
436 				if ((p[0] & 0x1f) == 0)
437 					break;
438 			}
439 		}
440 
441 		/*
442 		 * If the deterministic cache parameters are not
443 		 * available, or if no caches were reported to exist,
444 		 * just accept what the HTT flag indicated.
445 		 */
446 		if (hyperthreading_cpus == 0)
447 			hyperthreading_cpus = logical_cpus;
448 	}
449 
450 	assign_cpu_ids();
451 
452 	/* Start each Application Processor */
453 	start_all_aps();
454 
455 	set_interrupt_apic_ids();
456 }
457 
458 
459 /*
460  * Print various information about the SMP system hardware and setup.
461  */
462 void
463 cpu_mp_announce(void)
464 {
465 	int i, x;
466 	const char *hyperthread;
467 
468 	/* List CPUs */
469 	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
470 	for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
471 		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
472 			continue;
473 		if (cpu_info[x].cpu_hyperthread) {
474 			hyperthread = "/HT";
475 		} else {
476 			hyperthread = "";
477 		}
478 		if (cpu_info[x].cpu_disabled)
479 			printf("  cpu (AP%s): APIC ID: %2d (disabled)\n",
480 			    hyperthread, x);
481 		else {
482 			KASSERT(i < mp_ncpus,
483 			    ("mp_ncpus and actual cpus are out of whack"));
484 			printf(" cpu%d (AP%s): APIC ID: %2d\n", i++,
485 			    hyperthread, x);
486 		}
487 	}
488 }
489 
490 /*
491  * AP CPUs call this to initialize themselves.
492  */
493 void
494 init_secondary(void)
495 {
496 	struct pcpu *pc;
497 	vm_offset_t addr;
498 	int	gsel_tss;
499 	int	x, myid;
500 	u_int	cr0;
501 
502 	/* bootAP is set in start_ap() to our ID. */
503 	myid = bootAP;
504 
505 	/* Get per-cpu data */
506 	pc = &__pcpu[myid];
507 
508 	/* prime data page for it to use */
509 	pcpu_init(pc, myid, sizeof(struct pcpu));
510 	pc->pc_apic_id = cpu_apic_ids[myid];
511 	pc->pc_prvspace = pc;
512 	pc->pc_curthread = 0;
513 
514 	gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
515 	gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
516 
517 	for (x = 0; x < NGDT; x++) {
518 		ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
519 	}
520 
521 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
522 	r_gdt.rd_base = (int) &gdt[myid * NGDT];
523 	lgdt(&r_gdt);			/* does magic intra-segment return */
524 
525 	lidt(&r_idt);
526 
527 	lldt(_default_ldt);
528 	PCPU_SET(currentldt, _default_ldt);
529 
530 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
531 	gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
532 	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
533 	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
534 	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
535 	PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
536 	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
537 	ltr(gsel_tss);
538 
539 	PCPU_SET(fsgs_gdt, &gdt[myid * NGDT + GUFS_SEL].sd);
540 
541 	/*
542 	 * Set to a known state:
543 	 * Set by mpboot.s: CR0_PG, CR0_PE
544 	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
545 	 */
546 	cr0 = rcr0();
547 	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
548 	load_cr0(cr0);
549 	CHECK_WRITE(0x38, 5);
550 
551 	/* Disable local APIC just to be sure. */
552 	lapic_disable();
553 
554 	/* signal our startup to the BSP. */
555 	mp_naps++;
556 	CHECK_WRITE(0x39, 6);
557 
558 	/* Spin until the BSP releases the APs. */
559 	while (!aps_ready)
560 		ia32_pause();
561 
562 	/* BSP may have changed PTD while we were waiting */
563 	invltlb();
564 	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
565 		invlpg(addr);
566 
567 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
568 	lidt(&r_idt);
569 #endif
570 
571 	/* Initialize the PAT MSR if present. */
572 	pmap_init_pat();
573 
574 	/* set up CPU registers and state */
575 	cpu_setregs();
576 
577 	/* set up FPU state on the AP */
578 	npxinit(__INITIAL_NPXCW__);
579 
580 	/* set up SSE registers */
581 	enable_sse();
582 
583 #ifdef PAE
584 	/* Enable the PTE no-execute bit. */
585 	if ((amd_feature & AMDID_NX) != 0) {
586 		uint64_t msr;
587 
588 		msr = rdmsr(MSR_EFER) | EFER_NXE;
589 		wrmsr(MSR_EFER, msr);
590 	}
591 #endif
592 
593 	/* A quick check from sanity claus */
594 	if (PCPU_GET(apic_id) != lapic_id()) {
595 		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
596 		printf("SMP: actual apic_id = %d\n", lapic_id());
597 		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
598 		panic("cpuid mismatch! boom!!");
599 	}
600 
601 	/* Initialize curthread. */
602 	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
603 	PCPU_SET(curthread, PCPU_GET(idlethread));
604 
605 	mtx_lock_spin(&ap_boot_mtx);
606 
607 	/* Init local apic for irq's */
608 	lapic_setup(1);
609 
610 	/* Set memory range attributes for this CPU to match the BSP */
611 	mem_range_AP_init();
612 
613 	smp_cpus++;
614 
615 	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
616 	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
617 
618 	/* Determine if we are a logical CPU. */
619 	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
620 		logical_cpus_mask |= PCPU_GET(cpumask);
621 
622 	/* Determine if we are a hyperthread. */
623 	if (hyperthreading_cpus > 1 &&
624 	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
625 		hyperthreading_cpus_mask |= PCPU_GET(cpumask);
626 
627 	/* Build our map of 'other' CPUs. */
628 	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
629 
630 	if (bootverbose)
631 		lapic_dump("AP");
632 
633 	if (smp_cpus == mp_ncpus) {
634 		/* enable IPI's, tlb shootdown, freezes etc */
635 		atomic_store_rel_int(&smp_started, 1);
636 		smp_active = 1;	 /* historic */
637 	}
638 
639 	mtx_unlock_spin(&ap_boot_mtx);
640 
641 	/* wait until all the APs are up */
642 	while (smp_started == 0)
643 		ia32_pause();
644 
645 	/* enter the scheduler */
646 	sched_throw(NULL);
647 
648 	panic("scheduler returned us to %s", __func__);
649 	/* NOTREACHED */
650 }
651 
652 /*******************************************************************
653  * local functions and data
654  */
655 
656 /*
657  * We tell the I/O APIC code about all the CPUs we want to receive
658  * interrupts.  If we don't want certain CPUs to receive IRQs we
659  * can simply not tell the I/O APIC code about them in this function.
660  * We also do not tell it about the BSP since it tells itself about
661  * the BSP internally to work with UP kernels and on UP machines.
662  */
663 static void
664 set_interrupt_apic_ids(void)
665 {
666 	u_int i, apic_id;
667 
668 	for (i = 0; i < MAXCPU; i++) {
669 		apic_id = cpu_apic_ids[i];
670 		if (apic_id == -1)
671 			continue;
672 		if (cpu_info[apic_id].cpu_bsp)
673 			continue;
674 		if (cpu_info[apic_id].cpu_disabled)
675 			continue;
676 
677 		/* Don't let hyperthreads service interrupts. */
678 		if (hyperthreading_cpus > 1 &&
679 		    apic_id % hyperthreading_cpus != 0)
680 			continue;
681 
682 		intr_add_cpu(i);
683 	}
684 }
685 
686 /*
687  * Assign logical CPU IDs to local APICs.
688  */
689 static void
690 assign_cpu_ids(void)
691 {
692 	u_int i;
693 
694 	TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
695 	    &hyperthreading_allowed);
696 
697 	/* Check for explicitly disabled CPUs. */
698 	for (i = 0; i <= MAX_APIC_ID; i++) {
699 		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
700 			continue;
701 
702 		if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) {
703 			cpu_info[i].cpu_hyperthread = 1;
704 #if defined(SCHED_ULE)
705 			/*
706 			 * Don't use HT CPU if it has been disabled by a
707 			 * tunable.
708 			 */
709 			if (hyperthreading_allowed == 0) {
710 				cpu_info[i].cpu_disabled = 1;
711 				continue;
712 			}
713 #endif
714 		}
715 
716 		/* Don't use this CPU if it has been disabled by a tunable. */
717 		if (resource_disabled("lapic", i)) {
718 			cpu_info[i].cpu_disabled = 1;
719 			continue;
720 		}
721 	}
722 
723 	/*
724 	 * Assign CPU IDs to local APIC IDs and disable any CPUs
725 	 * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
726 	 * so we only have to assign IDs for APs.
727 	 */
728 	mp_ncpus = 1;
729 	for (i = 0; i <= MAX_APIC_ID; i++) {
730 		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
731 		    cpu_info[i].cpu_disabled)
732 			continue;
733 
734 		if (mp_ncpus < MAXCPU) {
735 			cpu_apic_ids[mp_ncpus] = i;
736 			apic_cpuids[i] = mp_ncpus;
737 			mp_ncpus++;
738 		} else
739 			cpu_info[i].cpu_disabled = 1;
740 	}
741 	KASSERT(mp_maxid >= mp_ncpus - 1,
742 	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
743 	    mp_ncpus));
744 }
745 
746 /*
747  * start each AP in our list
748  */
749 /* Lowest 1MB is already mapped: don't touch */
750 #define TMPMAP_START 1
751 static int
752 start_all_aps(void)
753 {
754 #ifndef PC98
755 	u_char mpbiosreason;
756 #endif
757 	uintptr_t kptbase;
758 	u_int32_t mpbioswarmvec;
759 	int apic_id, cpu, i;
760 
761 	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
762 
763 	/* install the AP 1st level boot code */
764 	install_ap_tramp();
765 
766 	/* save the current value of the warm-start vector */
767 	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
768 #ifndef PC98
769 	outb(CMOS_REG, BIOS_RESET);
770 	mpbiosreason = inb(CMOS_DATA);
771 #endif
772 
773 	/* set up temporary P==V mapping for AP boot */
774 	/* XXX this is a hack, we should boot the AP on its own stack/PTD */
775 
776 	kptbase = (uintptr_t)(void *)KPTphys;
777 	for (i = TMPMAP_START; i < NKPT; i++)
778 		PTD[i] = (pd_entry_t)(PG_V | PG_RW |
779 		    ((kptbase + i * PAGE_SIZE) & PG_FRAME));
780 	invltlb();
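	/*
	 * The loop above points page directory entries 1..NKPT-1 at the
	 * kernel's own page table pages, giving the APs a virtual == physical
	 * view of low memory while they boot on the shared PTD; the entries
	 * are zeroed again at the bottom of this function once every AP has
	 * started.
	 */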
781 
782 	/* start each AP */
783 	for (cpu = 1; cpu < mp_ncpus; cpu++) {
784 		apic_id = cpu_apic_ids[cpu];
785 
786 		/* allocate and set up a boot stack data page */
787 		bootstacks[cpu] = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
788 
789 		/* setup a vector to our boot code */
790 		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
791 		*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
792 #ifndef PC98
793 		outb(CMOS_REG, BIOS_RESET);
794 		outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
795 #endif
796 
797 		bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 4;
798 		bootAP = cpu;
799 
800 		/* attempt to start the Application Processor */
801 		CHECK_INIT(99);	/* setup checkpoints */
802 		if (!start_ap(apic_id)) {
803 			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
804 			CHECK_PRINT("trace");	/* show checkpoints */
805 			/* better panic as the AP may be running loose */
806 			printf("panic y/n? [y] ");
807 			if (cngetc() != 'n')
808 				panic("bye-bye");
809 		}
810 		CHECK_PRINT("trace");		/* show checkpoints */
811 
812 		all_cpus |= (1 << cpu);		/* record AP in CPU map */
813 	}
814 
815 	/* build our map of 'other' CPUs */
816 	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
817 
818 	/* restore the warmstart vector */
819 	*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
820 
821 #ifndef PC98
822 	outb(CMOS_REG, BIOS_RESET);
823 	outb(CMOS_DATA, mpbiosreason);
824 #endif
825 
826 	/* Undo V==P hack from above */
827 	for (i = TMPMAP_START; i < NKPT; i++)
828 		PTD[i] = 0;
829 	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);
830 
831 	/* number of APs actually started */
832 	return mp_naps;
833 }
834 
835 /*
836  * load the 1st level AP boot code into base memory.
837  */
838 
839 /* targets for relocation */
840 extern void bigJump(void);
841 extern void bootCodeSeg(void);
842 extern void bootDataSeg(void);
843 extern void MPentry(void);
844 extern u_int MP_GDT;
845 extern u_int mp_gdtbase;
846 
847 static void
848 install_ap_tramp(void)
849 {
850 	int     x;
851 	int     size = *(int *) ((u_long) & bootMP_size);
852 	vm_offset_t va = boot_address + KERNBASE;
853 	u_char *src = (u_char *) ((u_long) bootMP);
854 	u_char *dst = (u_char *) va;
855 	u_int   boot_base = (u_int) bootMP;
856 	u_int8_t *dst8;
857 	u_int16_t *dst16;
858 	u_int32_t *dst32;
859 
860 	KASSERT(size <= PAGE_SIZE,
861 	    ("'size' does not fit into PAGE_SIZE, as expected."));
862 	pmap_kenter(va, boot_address);
863 	pmap_invalidate_page (kernel_pmap, va);
864 	for (x = 0; x < size; ++x)
865 		*dst++ = *src++;
866 
867 	/*
868 	 * Modify addresses in the code we just moved to basemem.  Unfortunately
869 	 * we need fairly detailed info about mpboot.s for this to work; changes
870 	 * to mpboot.s might require changes here.
871 	 */
872 
873 	/* boot code is located in KERNEL space */
874 	dst = (u_char *) va;
875 
876 	/* modify the lgdt arg */
877 	dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
878 	*dst32 = boot_address + ((u_int) & MP_GDT - boot_base);
879 
880 	/* modify the ljmp target for MPentry() */
881 	dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
882 	*dst32 = ((u_int) MPentry - KERNBASE);
883 
884 	/* modify the target for boot code segment */
885 	dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
886 	dst8 = (u_int8_t *) (dst16 + 1);
887 	*dst16 = (u_int) boot_address & 0xffff;
888 	*dst8 = ((u_int) boot_address >> 16) & 0xff;
889 
890 	/* modify the target for boot data segment */
891 	dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
892 	dst8 = (u_int8_t *) (dst16 + 1);
893 	*dst16 = (u_int) boot_address & 0xffff;
894 	*dst8 = ((u_int) boot_address >> 16) & 0xff;
895 }
896 
897 /*
898  * This function starts the AP (application processor) identified
899  * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
900  * to accomplish this.  This is necessary because of the nuances
901  * of the different hardware we might encounter.  It isn't pretty,
902  * but it seems to work.
903  */
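/*
 * This is the classic local APIC "INIT, STARTUP, STARTUP" sequence: an
 * asserted and then deasserted INIT IPI resets the target CPU, and each
 * STARTUP IPI carries the page number of the trampoline (boot_address >>
 * 12), so the AP begins real-mode execution at boot_address itself.
 * Success is detected by watching mp_naps, which init_secondary()
 * increments once the AP is alive.
 */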
904 static int
905 start_ap(int apic_id)
906 {
907 	int vector, ms;
908 	int cpus;
909 
910 	/* calculate the vector */
911 	vector = (boot_address >> 12) & 0xff;
912 
913 	/* used as a watchpoint to signal AP startup */
914 	cpus = mp_naps;
915 
916 	/*
917 	 * First we do an INIT IPI: this INIT IPI might be run, resetting and
918 	 * running the target CPU.  OR this INIT IPI might be latched (P5
919 	 * bug), leaving the CPU waiting for a STARTUP IPI.  OR this INIT IPI
920 	 * might be ignored.
921 	 */
922 
923 	/* do an INIT IPI: assert RESET */
924 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
925 	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
926 
927 	/* wait for pending status end */
928 	lapic_ipi_wait(-1);
929 
930 	/* do an INIT IPI: deassert RESET */
931 	lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
932 	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);
933 
934 	/* wait for pending status end */
935 	DELAY(10000);		/* wait ~10 ms */
936 	lapic_ipi_wait(-1);
937 
938 	/*
939 	 * Next we do a STARTUP IPI: the previous INIT IPI might still be
940 	 * latched (P5 bug), in which case this 1st STARTUP would terminate
941 	 * immediately and the previously started INIT IPI would continue.  OR
942 	 * the previous INIT IPI has already run, and this STARTUP IPI will
943 	 * run.  OR the previous INIT IPI was ignored, and this STARTUP IPI
944 	 * will run.
945 	 */
946 
947 	/* do a STARTUP IPI */
948 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
949 	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
950 	    vector, apic_id);
951 	lapic_ipi_wait(-1);
952 	DELAY(200);		/* wait ~200 us */
953 
954 	/*
955 	 * Finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
956 	 * the previous STARTUP IPI was cancelled by a latched INIT IPI.
957 	 * Otherwise this STARTUP IPI will be ignored, as only ONE STARTUP IPI
958 	 * is recognized after a hardware RESET or INIT IPI.
959 	 */
960 
961 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
962 	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
963 	    vector, apic_id);
964 	lapic_ipi_wait(-1);
965 	DELAY(200);		/* wait ~200 us */
966 
967 	/* Wait up to 5 seconds for it to start. */
968 	for (ms = 0; ms < 5000; ms++) {
969 		if (mp_naps > cpus)
970 			return 1;	/* return SUCCESS */
971 		DELAY(1000);
972 	}
973 	return 0;		/* return FAILURE */
974 }
975 
976 #ifdef COUNT_XINVLTLB_HITS
977 u_int xhits_gbl[MAXCPU];
978 u_int xhits_pg[MAXCPU];
979 u_int xhits_rng[MAXCPU];
980 SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
981 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
982     sizeof(xhits_gbl), "IU", "");
983 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
984     sizeof(xhits_pg), "IU", "");
985 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
986     sizeof(xhits_rng), "IU", "");
987 
988 u_int ipi_global;
989 u_int ipi_page;
990 u_int ipi_range;
991 u_int ipi_range_size;
992 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
993 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
994 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
995 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
996     0, "");
997 
998 u_int ipi_masked_global;
999 u_int ipi_masked_page;
1000 u_int ipi_masked_range;
1001 u_int ipi_masked_range_size;
1002 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
1003     &ipi_masked_global, 0, "");
1004 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
1005     &ipi_masked_page, 0, "");
1006 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
1007     &ipi_masked_range, 0, "");
1008 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
1009     &ipi_masked_range_size, 0, "");
1010 #endif /* COUNT_XINVLTLB_HITS */
1011 
1012 /*
1013  * Flush the TLB on all other CPUs
1014  */
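/*
 * The protocol is simple: the initiating CPU takes smp_ipi_mtx, publishes
 * the address range in smp_tlb_addr1/smp_tlb_addr2, zeroes smp_tlb_wait,
 * sends the IPI and then spins until the low-level handlers (IDTVEC(invltlb)
 * and friends, installed in cpu_mp_start()) have advanced smp_tlb_wait to
 * ncpu.
 */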
1015 static void
1016 smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
1017 {
1018 	u_int ncpu;
1019 
1020 	ncpu = mp_ncpus - 1;	/* does not shootdown self */
1021 	if (ncpu < 1)
1022 		return;		/* no other cpus */
1023 	if (!(read_eflags() & PSL_I))
1024 		panic("%s: interrupts disabled", __func__);
1025 	mtx_lock_spin(&smp_ipi_mtx);
1026 	smp_tlb_addr1 = addr1;
1027 	smp_tlb_addr2 = addr2;
1028 	atomic_store_rel_int(&smp_tlb_wait, 0);
1029 	ipi_all_but_self(vector);
1030 	while (smp_tlb_wait < ncpu)
1031 		ia32_pause();
1032 	mtx_unlock_spin(&smp_ipi_mtx);
1033 }
1034 
1035 static void
1036 smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
1037 {
1038 	int ncpu, othercpus;
1039 
1040 	othercpus = mp_ncpus - 1;
1041 	if (mask == (u_int)-1) {
1042 		ncpu = othercpus;
1043 		if (ncpu < 1)
1044 			return;
1045 	} else {
1046 		mask &= ~PCPU_GET(cpumask);
1047 		if (mask == 0)
1048 			return;
1049 		ncpu = bitcount32(mask);
1050 		if (ncpu > othercpus) {
1051 			/* XXX this should be a panic offence */
1052 			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
1053 			    ncpu, othercpus);
1054 			ncpu = othercpus;
1055 		}
1056 		/* XXX should be a panic, implied by mask == 0 above */
1057 		if (ncpu < 1)
1058 			return;
1059 	}
1060 	if (!(read_eflags() & PSL_I))
1061 		panic("%s: interrupts disabled", __func__);
1062 	mtx_lock_spin(&smp_ipi_mtx);
1063 	smp_tlb_addr1 = addr1;
1064 	smp_tlb_addr2 = addr2;
1065 	atomic_store_rel_int(&smp_tlb_wait, 0);
1066 	if (mask == (u_int)-1)
1067 		ipi_all_but_self(vector);
1068 	else
1069 		ipi_selected(mask, vector);
1070 	while (smp_tlb_wait < ncpu)
1071 		ia32_pause();
1072 	mtx_unlock_spin(&smp_ipi_mtx);
1073 }
1074 
1075 void
1076 smp_cache_flush(void)
1077 {
1078 
1079 	if (smp_started)
1080 		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
1081 }
1082 
1083 void
1084 smp_invltlb(void)
1085 {
1086 
1087 	if (smp_started) {
1088 		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
1089 #ifdef COUNT_XINVLTLB_HITS
1090 		ipi_global++;
1091 #endif
1092 	}
1093 }
1094 
1095 void
1096 smp_invlpg(vm_offset_t addr)
1097 {
1098 
1099 	if (smp_started) {
1100 		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
1101 #ifdef COUNT_XINVLTLB_HITS
1102 		ipi_page++;
1103 #endif
1104 	}
1105 }
1106 
1107 void
1108 smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
1109 {
1110 
1111 	if (smp_started) {
1112 		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
1113 #ifdef COUNT_XINVLTLB_HITS
1114 		ipi_range++;
1115 		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
1116 #endif
1117 	}
1118 }
1119 
1120 void
1121 smp_masked_invltlb(u_int mask)
1122 {
1123 
1124 	if (smp_started) {
1125 		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
1126 #ifdef COUNT_XINVLTLB_HITS
1127 		ipi_masked_global++;
1128 #endif
1129 	}
1130 }
1131 
1132 void
1133 smp_masked_invlpg(u_int mask, vm_offset_t addr)
1134 {
1135 
1136 	if (smp_started) {
1137 		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
1138 #ifdef COUNT_XINVLTLB_HITS
1139 		ipi_masked_page++;
1140 #endif
1141 	}
1142 }
1143 
1144 void
1145 smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
1146 {
1147 
1148 	if (smp_started) {
1149 		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
1150 #ifdef COUNT_XINVLTLB_HITS
1151 		ipi_masked_range++;
1152 		ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
1153 #endif
1154 	}
1155 }
1156 
1157 void
1158 ipi_bitmap_handler(struct trapframe frame)
1159 {
1160 	int cpu = PCPU_GET(cpuid);
1161 	u_int ipi_bitmap;
1162 
1163 	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
1164 
1165 	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
1166 #ifdef COUNT_IPIS
1167 		(*ipi_preempt_counts[cpu])++;
1168 #endif
1169 		sched_preempt(curthread);
1170 	}
1171 
1172 	if (ipi_bitmap & (1 << IPI_AST)) {
1173 #ifdef COUNT_IPIS
1174 		(*ipi_ast_counts[cpu])++;
1175 #endif
1176 		/* Nothing to do for AST */
1177 	}
1178 }
1179 
1180 /*
1181  * send an IPI to a set of cpus.
1182  */
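/*
 * Small IPIs (IPI_PREEMPT, IPI_AST) are bitmapped: rather than using a
 * vector apiece they are folded onto IPI_BITMAP_VECTOR.  The requested IPI
 * is recorded in the target's cpu_ipi_pending word and decoded later by
 * ipi_bitmap_handler(); only the sender that sets the first bit actually
 * raises the vector, which is why a non-zero old_pending short-circuits the
 * loop below.
 */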
1183 void
1184 ipi_selected(u_int32_t cpus, u_int ipi)
1185 {
1186 	int cpu;
1187 	u_int bitmap = 0;
1188 	u_int old_pending;
1189 	u_int new_pending;
1190 
1191 	if (IPI_IS_BITMAPED(ipi)) {
1192 		bitmap = 1 << ipi;
1193 		ipi = IPI_BITMAP_VECTOR;
1194 	}
1195 
1196 #ifdef STOP_NMI
1197 	if (ipi == IPI_STOP && stop_cpus_with_nmi) {
1198 		ipi_nmi_selected(cpus);
1199 		return;
1200 	}
1201 #endif
1202 	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
1203 	while ((cpu = ffs(cpus)) != 0) {
1204 		cpu--;
1205 		cpus &= ~(1 << cpu);
1206 
1207 		KASSERT(cpu_apic_ids[cpu] != -1,
1208 		    ("IPI to non-existent CPU %d", cpu));
1209 
1210 		if (bitmap) {
1211 			do {
1212 				old_pending = cpu_ipi_pending[cpu];
1213 				new_pending = old_pending | bitmap;
1214 			} while  (!atomic_cmpset_int(&cpu_ipi_pending[cpu],old_pending, new_pending));
1215 
1216 			if (old_pending)
1217 				continue;
1218 		}
1219 
1220 		lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
1221 	}
1222 
1223 }
1224 
1225 /*
1226  * send an IPI to all CPUs EXCEPT myself
1227  */
1228 void
1229 ipi_all_but_self(u_int ipi)
1230 {
1231 
1232 	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
1233 		ipi_selected(PCPU_GET(other_cpus), ipi);
1234 		return;
1235 	}
1236 	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1237 	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
1238 }
1239 
1240 #ifdef STOP_NMI
1241 /*
1242  * send NMI IPI to selected CPUs
1243  */
1244 
1245 #define	BEFORE_SPIN	1000000
1246 
1247 void
1248 ipi_nmi_selected(u_int32_t cpus)
1249 {
1250 	int cpu;
1251 	register_t icrlo;
1252 
1253 	icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT
1254 		| APIC_TRIGMOD_EDGE;
1255 
1256 	CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus);
1257 
1258 	atomic_set_int(&ipi_nmi_pending, cpus);
1259 
1260 	while ((cpu = ffs(cpus)) != 0) {
1261 		cpu--;
1262 		cpus &= ~(1 << cpu);
1263 
1264 		KASSERT(cpu_apic_ids[cpu] != -1,
1265 		    ("IPI NMI to non-existent CPU %d", cpu));
1266 
1267 		/* Wait for an earlier IPI to finish. */
1268 		if (!lapic_ipi_wait(BEFORE_SPIN))
1269 			panic("ipi_nmi_selected: previous IPI has not cleared");
1270 
1271 		lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]);
1272 	}
1273 }
1274 
1275 int
1276 ipi_nmi_handler(void)
1277 {
1278 	int cpumask = PCPU_GET(cpumask);
1279 
1280 	if (!(ipi_nmi_pending & cpumask))
1281 		return 1;
1282 
1283 	atomic_clear_int(&ipi_nmi_pending, cpumask);
1284 	cpustop_handler();
1285 	return 0;
1286 }
1287 
1288 #endif /* STOP_NMI */
1289 
1290 /*
1291  * Handle an IPI_STOP by saving our current context and spinning until we
1292  * are resumed.
1293  */
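/*
 * The handshake is built on two global CPU masks: a stopped CPU sets its
 * bit in stopped_cpus and spins on started_cpus; whoever issued the stop
 * later sets the started_cpus bit to release it, and both bits are cleared
 * again on the way out.
 */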
1294 void
1295 cpustop_handler(void)
1296 {
1297 	int cpu = PCPU_GET(cpuid);
1298 	int cpumask = PCPU_GET(cpumask);
1299 
1300 	savectx(&stoppcbs[cpu]);
1301 
1302 	/* Indicate that we are stopped */
1303 	atomic_set_int(&stopped_cpus, cpumask);
1304 
1305 	/* Wait for restart */
1306 	while (!(started_cpus & cpumask))
1307 	    ia32_pause();
1308 
1309 	atomic_clear_int(&started_cpus, cpumask);
1310 	atomic_clear_int(&stopped_cpus, cpumask);
1311 
1312 	if (cpu == 0 && cpustop_restartfunc != NULL) {
1313 		cpustop_restartfunc();
1314 		cpustop_restartfunc = NULL;
1315 	}
1316 }
1317 
1318 /*
1319  * This is called once the rest of the system is up and running and we're
1320  * ready to let the APs out of the pen.
1321  */
1322 static void
1323 release_aps(void *dummy __unused)
1324 {
1325 
1326 	if (mp_ncpus == 1)
1327 		return;
1328 	atomic_store_rel_int(&aps_ready, 1);
1329 	while (smp_started == 0)
1330 		ia32_pause();
1331 }
1332 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1333 
1334 static int
1335 sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
1336 {
1337 	u_int mask;
1338 	int error;
1339 
1340 	mask = hlt_cpus_mask;
1341 	error = sysctl_handle_int(oidp, &mask, 0, req);
1342 	if (error || !req->newptr)
1343 		return (error);
1344 
1345 	if (logical_cpus_mask != 0 &&
1346 	    (mask & logical_cpus_mask) == logical_cpus_mask)
1347 		hlt_logical_cpus = 1;
1348 	else
1349 		hlt_logical_cpus = 0;
1350 
1351 	if (! hyperthreading_allowed)
1352 		mask |= hyperthreading_cpus_mask;
1353 
1354 	if ((mask & all_cpus) == all_cpus)
1355 		mask &= ~(1<<0);
1356 	hlt_cpus_mask = mask;
1357 	return (error);
1358 }
1359 SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
1360     0, 0, sysctl_hlt_cpus, "IU",
1361     "Bitmap of CPUs to halt.  101 (binary) will halt CPUs 0 and 2.");
1362 
1363 static int
1364 sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
1365 {
1366 	int disable, error;
1367 
1368 	disable = hlt_logical_cpus;
1369 	error = sysctl_handle_int(oidp, &disable, 0, req);
1370 	if (error || !req->newptr)
1371 		return (error);
1372 
1373 	if (disable)
1374 		hlt_cpus_mask |= logical_cpus_mask;
1375 	else
1376 		hlt_cpus_mask &= ~logical_cpus_mask;
1377 
1378 	if (! hyperthreading_allowed)
1379 		hlt_cpus_mask |= hyperthreading_cpus_mask;
1380 
1381 	if ((hlt_cpus_mask & all_cpus) == all_cpus)
1382 		hlt_cpus_mask &= ~(1<<0);
1383 
1384 	hlt_logical_cpus = disable;
1385 	return (error);
1386 }
1387 
1388 static int
1389 sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS)
1390 {
1391 	int allowed, error;
1392 
1393 	allowed = hyperthreading_allowed;
1394 	error = sysctl_handle_int(oidp, &allowed, 0, req);
1395 	if (error || !req->newptr)
1396 		return (error);
1397 
1398 #ifdef SCHED_ULE
1399 	/*
1400 	 * SCHED_ULE doesn't allow enabling/disabling HT cores at
1401 	 * run time.
1402 	 */
1403 	if (allowed != hyperthreading_allowed)
1404 		return (ENOTSUP);
1405 	return (error);
1406 #endif
1407 
1408 	if (allowed)
1409 		hlt_cpus_mask &= ~hyperthreading_cpus_mask;
1410 	else
1411 		hlt_cpus_mask |= hyperthreading_cpus_mask;
1412 
1413 	if (logical_cpus_mask != 0 &&
1414 	    (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask)
1415 		hlt_logical_cpus = 1;
1416 	else
1417 		hlt_logical_cpus = 0;
1418 
1419 	if ((hlt_cpus_mask & all_cpus) == all_cpus)
1420 		hlt_cpus_mask &= ~(1<<0);
1421 
1422 	hyperthreading_allowed = allowed;
1423 	return (error);
1424 }
1425 
1426 static void
1427 cpu_hlt_setup(void *dummy __unused)
1428 {
1429 
1430 	if (logical_cpus_mask != 0) {
1431 		TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
1432 		    &hlt_logical_cpus);
1433 		sysctl_ctx_init(&logical_cpu_clist);
1434 		SYSCTL_ADD_PROC(&logical_cpu_clist,
1435 		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
1436 		    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
1437 		    sysctl_hlt_logical_cpus, "IU", "");
1438 		SYSCTL_ADD_UINT(&logical_cpu_clist,
1439 		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
1440 		    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
1441 		    &logical_cpus_mask, 0, "");
1442 
1443 		if (hlt_logical_cpus)
1444 			hlt_cpus_mask |= logical_cpus_mask;
1445 
1446 		/*
1447 		 * If necessary for security purposes, force
1448 		 * hyperthreading off, regardless of the value
1449 		 * of hlt_logical_cpus.
1450 		 */
1451 		if (hyperthreading_cpus_mask) {
1452 			SYSCTL_ADD_PROC(&logical_cpu_clist,
1453 			    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
1454 			    "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW,
1455 			    0, 0, sysctl_hyperthreading_allowed, "IU", "");
1456 			if (! hyperthreading_allowed)
1457 				hlt_cpus_mask |= hyperthreading_cpus_mask;
1458 		}
1459 	}
1460 }
1461 SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL);
1462 
1463 int
1464 mp_grab_cpu_hlt(void)
1465 {
1466 	u_int mask = PCPU_GET(cpumask);
1467 #ifdef MP_WATCHDOG
1468 	u_int cpuid = PCPU_GET(cpuid);
1469 #endif
1470 	int retval;
1471 
1472 #ifdef MP_WATCHDOG
1473 	ap_watchdog(cpuid);
1474 #endif
1475 
1476 	retval = mask & hlt_cpus_mask;
1477 	while (mask & hlt_cpus_mask)
1478 		__asm __volatile("sti; hlt" : : : "memory");
1479 	return (retval);
1480 }
1481 
1482 #ifdef COUNT_IPIS
1483 /*
1484  * Setup interrupt counters for IPI handlers.
1485  */
1486 static void
1487 mp_ipi_intrcnt(void *dummy)
1488 {
1489 	char buf[64];
1490 	int i;
1491 
1492 	for (i = 0; i < mp_maxid; i++) {
1493 		if (CPU_ABSENT(i))
1494 			continue;
1495 		snprintf(buf, sizeof(buf), "cpu%d: invltlb", i);
1496 		intrcnt_add(buf, &ipi_invltlb_counts[i]);
1497 		snprintf(buf, sizeof(buf), "cpu%d: invlrng", i);
1498 		intrcnt_add(buf, &ipi_invlrng_counts[i]);
1499 		snprintf(buf, sizeof(buf), "cpu%d: invlpg", i);
1500 		intrcnt_add(buf, &ipi_invlpg_counts[i]);
1501 		snprintf(buf, sizeof(buf), "cpu%d: preempt", i);
1502 		intrcnt_add(buf, &ipi_preempt_counts[i]);
1503 		snprintf(buf, sizeof(buf), "cpu%d: ast", i);
1504 		intrcnt_add(buf, &ipi_ast_counts[i]);
1505 		snprintf(buf, sizeof(buf), "cpu%d: rendezvous", i);
1506 		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
1507 		snprintf(buf, sizeof(buf), "cpu%d: lazypmap", i);
1508 		intrcnt_add(buf, &ipi_lazypmap_counts[i]);
1509 	}
1510 }
1511 SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
1512 #endif
1513