xref: /freebsd/sys/i386/i386/mp_machdep.c (revision e28a4053)
1 /*-
2  * Copyright (c) 1996, by Steve Passe
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. The name of the developer may NOT be used to endorse or promote products
11  *    derived from this software without specific prior written permission.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28 
29 #include "opt_apic.h"
30 #include "opt_cpu.h"
31 #include "opt_kstack_pages.h"
32 #include "opt_mp_watchdog.h"
33 #include "opt_pmap.h"
34 #include "opt_sched.h"
35 #include "opt_smp.h"
36 
37 #if !defined(lint)
38 #if !defined(SMP)
39 #error How did you get here?
40 #endif
41 
42 #ifndef DEV_APIC
43 #error The apic device is required for SMP, add "device apic" to your config file.
44 #endif
45 #if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
46 #error SMP not supported with CPU_DISABLE_CMPXCHG
47 #endif
48 #endif /* not lint */
49 
50 #include <sys/param.h>
51 #include <sys/systm.h>
52 #include <sys/bus.h>
53 #include <sys/cons.h>	/* cngetc() */
54 #ifdef GPROF
55 #include <sys/gmon.h>
56 #endif
57 #include <sys/kernel.h>
58 #include <sys/ktr.h>
59 #include <sys/lock.h>
60 #include <sys/malloc.h>
61 #include <sys/memrange.h>
62 #include <sys/mutex.h>
63 #include <sys/pcpu.h>
64 #include <sys/proc.h>
65 #include <sys/sched.h>
66 #include <sys/smp.h>
67 #include <sys/sysctl.h>
68 
69 #include <vm/vm.h>
70 #include <vm/vm_param.h>
71 #include <vm/pmap.h>
72 #include <vm/vm_kern.h>
73 #include <vm/vm_extern.h>
74 
75 #include <x86/apicreg.h>
76 #include <machine/clock.h>
77 #include <machine/cputypes.h>
78 #include <x86/mca.h>
79 #include <machine/md_var.h>
80 #include <machine/mp_watchdog.h>
81 #include <machine/pcb.h>
82 #include <machine/psl.h>
83 #include <machine/smp.h>
84 #include <machine/specialreg.h>
85 
86 #define WARMBOOT_TARGET		0
87 #define WARMBOOT_OFF		(KERNBASE + 0x0467)
88 #define WARMBOOT_SEG		(KERNBASE + 0x0469)
89 
90 #define CMOS_REG		(0x70)
91 #define CMOS_DATA		(0x71)
92 #define BIOS_RESET		(0x0f)
93 #define BIOS_WARM		(0x0a)
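/*
 * Writing BIOS_WARM into the CMOS shutdown status byte (offset BIOS_RESET)
 * and storing the trampoline address in the 40:67 warm-boot vector
 * (WARMBOOT_OFF/WARMBOOT_SEG) lets an AP that resets through the BIOS jump
 * straight to our boot code instead of running the normal POST path.
 */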
94 
95 /*
96  * This code MUST be enabled here and in mpboot.s.
97  * It follows the very early stages of AP boot by placing values in CMOS RAM.
98  * It is NORMALLY never needed, hence the primitive method for enabling it.
99  *
100 #define CHECK_POINTS
101  */
102 
103 #if defined(CHECK_POINTS) && !defined(PC98)
104 #define CHECK_READ(A)	 (outb(CMOS_REG, (A)), inb(CMOS_DATA))
105 #define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))
106 
107 #define CHECK_INIT(D);				\
108 	CHECK_WRITE(0x34, (D));			\
109 	CHECK_WRITE(0x35, (D));			\
110 	CHECK_WRITE(0x36, (D));			\
111 	CHECK_WRITE(0x37, (D));			\
112 	CHECK_WRITE(0x38, (D));			\
113 	CHECK_WRITE(0x39, (D));
114 
115 #define CHECK_PRINT(S);				\
116 	printf("%s: %d, %d, %d, %d, %d, %d\n",	\
117 	   (S),					\
118 	   CHECK_READ(0x34),			\
119 	   CHECK_READ(0x35),			\
120 	   CHECK_READ(0x36),			\
121 	   CHECK_READ(0x37),			\
122 	   CHECK_READ(0x38),			\
123 	   CHECK_READ(0x39));
124 
125 #else				/* CHECK_POINTS */
126 
127 #define CHECK_INIT(D)
128 #define CHECK_PRINT(S)
129 #define CHECK_WRITE(A, D)
130 
131 #endif				/* CHECK_POINTS */
132 
133 /* lock region used by kernel profiling */
134 int	mcount_lock;
135 
136 int	mp_naps;		/* # of Application Processors */
137 int	boot_cpu_id = -1;	/* designated BSP */
138 
139 extern	struct pcpu __pcpu[];
140 
141 /* AP uses this during bootstrap.  Do not staticize.  */
142 char *bootSTK;
143 static int bootAP;
144 
145 /* Free these after use */
146 void *bootstacks[MAXCPU];
147 static void *dpcpu;
148 
149 /* Hotwire a 0->4MB V==P mapping */
150 extern pt_entry_t *KPTphys;
151 
152 struct pcb stoppcbs[MAXCPU];
153 
154 /* Variables needed for SMP tlb shootdown. */
155 vm_offset_t smp_tlb_addr1;
156 vm_offset_t smp_tlb_addr2;
157 volatile int smp_tlb_wait;
158 
159 #ifdef COUNT_IPIS
160 /* Interrupt counts. */
161 static u_long *ipi_preempt_counts[MAXCPU];
162 static u_long *ipi_ast_counts[MAXCPU];
163 u_long *ipi_invltlb_counts[MAXCPU];
164 u_long *ipi_invlrng_counts[MAXCPU];
165 u_long *ipi_invlpg_counts[MAXCPU];
166 u_long *ipi_invlcache_counts[MAXCPU];
167 u_long *ipi_rendezvous_counts[MAXCPU];
168 u_long *ipi_lazypmap_counts[MAXCPU];
169 static u_long *ipi_hardclock_counts[MAXCPU];
170 #endif
171 
172 /*
173  * Local data and functions.
174  */
175 
176 static volatile cpumask_t ipi_nmi_pending;
177 
178 /* used to hold the APs until we are ready to release them */
179 static struct mtx ap_boot_mtx;
180 
181 /* Set to 1 once we're ready to let the APs out of the pen. */
182 static volatile int aps_ready = 0;
183 
184 /*
185  * Store data from cpu_add() until later in the boot when we actually setup
186  * the APs.
187  */
188 struct cpu_info {
189 	int	cpu_present:1;
190 	int	cpu_bsp:1;
191 	int	cpu_disabled:1;
192 	int	cpu_hyperthread:1;
193 } static cpu_info[MAX_APIC_ID + 1];
194 int cpu_apic_ids[MAXCPU];
195 int apic_cpuids[MAX_APIC_ID + 1];
196 
197 /* Holds pending bitmap based IPIs per CPU */
198 static volatile u_int cpu_ipi_pending[MAXCPU];
199 
200 static u_int boot_address;
201 static int cpu_logical;			/* logical cpus per core */
202 static int cpu_cores;			/* cores per package */
203 
204 static void	assign_cpu_ids(void);
205 static void	install_ap_tramp(void);
206 static void	set_interrupt_apic_ids(void);
207 static int	start_all_aps(void);
208 static int	start_ap(int apic_id);
209 static void	release_aps(void *dummy);
210 
211 static int	hlt_logical_cpus;
212 static u_int	hyperthreading_cpus;	/* logical cpus sharing L1 cache */
213 static cpumask_t	hyperthreading_cpus_mask;
214 static int	hyperthreading_allowed = 1;
215 static struct	sysctl_ctx_list logical_cpu_clist;
216 
217 static void
218 mem_range_AP_init(void)
219 {
220 	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
221 		mem_range_softc.mr_op->initAP(&mem_range_softc);
222 }
223 
224 static void
225 topo_probe_amd(void)
226 {
227 
228 	/* AMD processors do not support HTT. */
229 	cpu_cores = (amd_feature2 & AMDID2_CMP) != 0 ?
230 	    (cpu_procinfo2 & AMDID_CMP_CORES) + 1 : 1;
231 	cpu_logical = 1;
232 }
233 
234 /*
235  * Round up to the next power of two, if necessary, and then
236  * take log2.
237  * Returns -1 if argument is zero.
238  */
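/*
 * For example, mask_width(6) rounds 6 up to 8 and returns 3, mask_width(4)
 * returns 2, and mask_width(0) returns -1 (since fls(0) == 0).
 */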
239 static __inline int
240 mask_width(u_int x)
241 {
242 
243 	return (fls(x << (1 - powerof2(x))) - 1);
244 }
245 
246 static void
247 topo_probe_0x4(void)
248 {
249 	u_int p[4];
250 	int pkg_id_bits;
251 	int core_id_bits;
252 	int max_cores;
253 	int max_logical;
254 	int id;
255 
256 	/* Both zero and one here mean one logical processor per package. */
257 	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
258 	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
259 	if (max_logical <= 1)
260 		return;
261 
262 	/*
263 	 * Because of the uniformity assumption we examine only
264 	 * those logical processors that belong to the same
265 	 * package as the BSP.  Further, we count the number of
266 	 * logical processors that belong to the same core as
267 	 * the BSP, thus deducing the number of threads per core.
268 	 */
269 	cpuid_count(0x04, 0, p);
270 	max_cores = ((p[0] >> 26) & 0x3f) + 1;
271 	core_id_bits = mask_width(max_logical/max_cores);
272 	if (core_id_bits < 0)
273 		return;
274 	pkg_id_bits = core_id_bits + mask_width(max_cores);
275 
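	/*
	 * Count APIC IDs that share the BSP's package ID bits; cpu_cores
	 * first accumulates the number of logical CPUs in the BSP's package
	 * and is divided by cpu_logical (threads per core) below to yield
	 * cores per package.
	 */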
276 	for (id = 0; id <= MAX_APIC_ID; id++) {
277 		/* Check logical CPU availability. */
278 		if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
279 			continue;
280 		/* Check if logical CPU has the same package ID. */
281 		if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits))
282 			continue;
283 		cpu_cores++;
284 		/* Check if logical CPU has the same package and core IDs. */
285 		if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits))
286 			cpu_logical++;
287 	}
288 
289 	cpu_cores /= cpu_logical;
290 	hyperthreading_cpus = cpu_logical;
291 }
292 
293 static void
294 topo_probe_0xb(void)
295 {
296 	u_int p[4];
297 	int bits;
298 	int cnt;
299 	int i;
300 	int logical;
301 	int type;
302 	int x;
303 
304 	/* We only support three levels for now. */
305 	for (i = 0; i < 3; i++) {
306 		cpuid_count(0x0b, i, p);
307 
308 		/* Fall back if CPUID leaf 0xb doesn't really exist. */
309 		if (i == 0 && p[1] == 0) {
310 			topo_probe_0x4();
311 			return;
312 		}
313 
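		/*
		 * CPUID leaf 0xb: EAX[4:0] is the APIC ID shift for the next
		 * level, EBX[15:0] is the number of logical processors at
		 * this level, and ECX[15:8] is the level type (SMT or core).
		 */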
314 		bits = p[0] & 0x1f;
315 		logical = p[1] &= 0xffff;
316 		type = (p[2] >> 8) & 0xff;
317 		if (type == 0 || logical == 0)
318 			break;
319 		/*
320 		 * Because of the uniformity assumption we examine only
321 		 * those logical processors that belong to the same
322 		 * package as the BSP.
323 		 */
324 		for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) {
325 			if (!cpu_info[x].cpu_present ||
326 			    cpu_info[x].cpu_disabled)
327 				continue;
328 			if (x >> bits == boot_cpu_id >> bits)
329 				cnt++;
330 		}
331 		if (type == CPUID_TYPE_SMT)
332 			cpu_logical = cnt;
333 		else if (type == CPUID_TYPE_CORE)
334 			cpu_cores = cnt;
335 	}
336 	if (cpu_logical == 0)
337 		cpu_logical = 1;
338 	cpu_cores /= cpu_logical;
339 }
340 
341 /*
342  * Both the topology discovery code and the code that consumes
343  * topology information assume top-down uniformity of the topology.
344  * That is, all physical packages must be identical and each
345  * core in a package must have the same number of threads.
346  * Topology information is queried only on the BSP, on which this
347  * code runs and which can query CPUID information.
348  * The topology is then extrapolated to all packages using the
349  * uniformity assumption.
350  */
351 static void
352 topo_probe(void)
353 {
354 	static int cpu_topo_probed = 0;
355 
356 	if (cpu_topo_probed)
357 		return;
358 
359 	logical_cpus_mask = 0;
360 	if (cpu_vendor_id == CPU_VENDOR_AMD)
361 		topo_probe_amd();
362 	else if (cpu_vendor_id == CPU_VENDOR_INTEL) {
363 		/*
364 		 * See the Intel(R) 64 Architecture Processor
365 		 * Topology Enumeration article for details.
366 		 *
367 		 * Note that the 0x1 <= cpu_high < 4 case should be
368 		 * compatible with the topo_probe_0x4() logic when
369 		 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1),
370 		 * or it should trigger the fallback otherwise.
371 		 */
372 		if (cpu_high >= 0xb)
373 			topo_probe_0xb();
374 		else if (cpu_high >= 0x1)
375 			topo_probe_0x4();
376 	}
377 
378 	/*
379 	 * Fallback: assume each logical CPU is in a separate
380 	 * physical package.  That is, no multi-core, no SMT.
381 	 */
382 	if (cpu_cores == 0)
383 		cpu_cores = 1;
384 	if (cpu_logical == 0)
385 		cpu_logical = 1;
386 	cpu_topo_probed = 1;
387 }
388 
389 struct cpu_group *
390 cpu_topo(void)
391 {
392 	int cg_flags;
393 
394 	/*
395 	 * Determine whether any threading flags are
396 	 * necessary.
397 	 */
398 	topo_probe();
399 	if (cpu_logical > 1 && hyperthreading_cpus)
400 		cg_flags = CG_FLAG_HTT;
401 	else if (cpu_logical > 1)
402 		cg_flags = CG_FLAG_SMT;
403 	else
404 		cg_flags = 0;
405 	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
406 		printf("WARNING: Non-uniform processors.\n");
407 		printf("WARNING: Using suboptimal topology.\n");
408 		return (smp_topo_none());
409 	}
410 	/*
411 	 * Neither multi-core nor hyper-threaded.
412 	 */
413 	if (cpu_logical * cpu_cores == 1)
414 		return (smp_topo_none());
415 	/*
416 	 * Only HTT, no multi-core.
417 	 */
418 	if (cpu_logical > 1 && cpu_cores == 1)
419 		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags));
420 	/*
421 	 * Only multi-core, no HTT.
422 	 */
423 	if (cpu_cores > 1 && cpu_logical == 1)
424 		return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags));
425 	/*
426 	 * Both HTT and multi-core.
427 	 */
428 	return (smp_topo_2level(CG_SHARE_L2, cpu_cores,
429 	    CG_SHARE_L1, cpu_logical, cg_flags));
430 }
431 
432 
433 /*
434  * Calculate a usable address in base memory for the AP trampoline code.
435  */
436 u_int
437 mp_bootaddress(u_int basemem)
438 {
439 
440 	boot_address = trunc_page(basemem);	/* round down to 4k boundary */
441 	if ((basemem - boot_address) < bootMP_size)
442 		boot_address -= PAGE_SIZE;	/* not enough, lower by 4k */
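	/*
	 * For example, if basemem were 0x9fc00 (639KB), trunc_page() would
	 * give boot_address 0x9f000, leaving 0xc00 bytes; if bootMP_size did
	 * not fit there we would drop another page, to 0x9e000.
	 */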
443 
444 	return boot_address;
445 }
446 
447 void
448 cpu_add(u_int apic_id, char boot_cpu)
449 {
450 
451 	if (apic_id > MAX_APIC_ID) {
452 		panic("SMP: APIC ID %d too high", apic_id);
453 		return;
454 	}
455 	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
456 	    apic_id));
457 	cpu_info[apic_id].cpu_present = 1;
458 	if (boot_cpu) {
459 		KASSERT(boot_cpu_id == -1,
460 		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
461 		    boot_cpu_id));
462 		boot_cpu_id = apic_id;
463 		cpu_info[apic_id].cpu_bsp = 1;
464 	}
465 	if (mp_ncpus < MAXCPU)
466 		mp_ncpus++;
467 	if (bootverbose)
468 		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
469 		    "AP");
470 }
471 
472 void
473 cpu_mp_setmaxid(void)
474 {
475 
476 	mp_maxid = MAXCPU - 1;
477 }
478 
479 int
480 cpu_mp_probe(void)
481 {
482 
483 	/*
484 	 * Always record the BSP in the CPU map so that the mbuf init code works
485 	 * correctly.
486 	 */
487 	all_cpus = 1;
488 	if (mp_ncpus == 0) {
489 		/*
490 		 * No CPUs were found, so this must be a UP system.  Set up
491 		 * the variables to represent a system with a single CPU
492 		 * with an ID of 0.
493 		 */
494 		mp_ncpus = 1;
495 		return (0);
496 	}
497 
498 	/* At least one CPU was found. */
499 	if (mp_ncpus == 1) {
500 		/*
501 		 * One CPU was found, so this must be a UP system with
502 		 * an I/O APIC.
503 		 */
504 		return (0);
505 	}
506 
507 	/* At least two CPUs were found. */
508 	return (1);
509 }
510 
511 /*
512  * Initialize the IPI handlers and start up the APs.
513  */
514 void
515 cpu_mp_start(void)
516 {
517 	int i;
518 
519 	/* Initialize the logical ID to APIC ID table. */
520 	for (i = 0; i < MAXCPU; i++) {
521 		cpu_apic_ids[i] = -1;
522 		cpu_ipi_pending[i] = 0;
523 	}
524 
525 	/* Install an inter-CPU IPI for TLB invalidation */
526 	setidt(IPI_INVLTLB, IDTVEC(invltlb),
527 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
528 	setidt(IPI_INVLPG, IDTVEC(invlpg),
529 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
530 	setidt(IPI_INVLRNG, IDTVEC(invlrng),
531 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
532 
533 	/* Install an inter-CPU IPI for cache invalidation. */
534 	setidt(IPI_INVLCACHE, IDTVEC(invlcache),
535 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
536 
537 	/* Install an inter-CPU IPI for lazy pmap release */
538 	setidt(IPI_LAZYPMAP, IDTVEC(lazypmap),
539 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
540 
541 	/* Install an inter-CPU IPI for all-CPU rendezvous */
542 	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous),
543 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
544 
545 	/* Install generic inter-CPU IPI handler */
546 	setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
547 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
548 
549 	/* Install an inter-CPU IPI for CPU stop/restart */
550 	setidt(IPI_STOP, IDTVEC(cpustop),
551 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
552 
553 
554 	/* Set boot_cpu_id if needed. */
555 	if (boot_cpu_id == -1) {
556 		boot_cpu_id = PCPU_GET(apic_id);
557 		cpu_info[boot_cpu_id].cpu_bsp = 1;
558 	} else
559 		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
560 		    ("BSP's APIC ID doesn't match boot_cpu_id"));
561 
562 	/* Probe logical/physical core configuration. */
563 	topo_probe();
564 
565 	assign_cpu_ids();
566 
567 	/* Start each Application Processor */
568 	start_all_aps();
569 
570 	set_interrupt_apic_ids();
571 }
572 
573 
574 /*
575  * Print various information about the SMP system hardware and setup.
576  */
577 void
578 cpu_mp_announce(void)
579 {
580 	const char *hyperthread;
581 	int i;
582 
583 	printf("FreeBSD/SMP: %d package(s) x %d core(s)",
584 	    mp_ncpus / (cpu_cores * cpu_logical), cpu_cores);
585 	if (hyperthreading_cpus > 1)
586 	    printf(" x %d HTT threads", cpu_logical);
587 	else if (cpu_logical > 1)
588 	    printf(" x %d SMT threads", cpu_logical);
589 	printf("\n");
590 
591 	/* List active CPUs first. */
592 	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
593 	for (i = 1; i < mp_ncpus; i++) {
594 		if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread)
595 			hyperthread = "/HT";
596 		else
597 			hyperthread = "";
598 		printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread,
599 		    cpu_apic_ids[i]);
600 	}
601 
602 	/* List disabled CPUs last. */
603 	for (i = 0; i <= MAX_APIC_ID; i++) {
604 		if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled)
605 			continue;
606 		if (cpu_info[i].cpu_hyperthread)
607 			hyperthread = "/HT";
608 		else
609 			hyperthread = "";
610 		printf("  cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread,
611 		    i);
612 	}
613 }
614 
615 /*
616  * AP CPUs call this to initialize themselves.
617  */
618 void
619 init_secondary(void)
620 {
621 	struct pcpu *pc;
622 	vm_offset_t addr;
623 	int	gsel_tss;
624 	int	x, myid;
625 	u_int	cr0;
626 
627 	/* bootAP is set in start_ap() to our ID. */
628 	myid = bootAP;
629 
630 	/* Get per-cpu data */
631 	pc = &__pcpu[myid];
632 
633 	/* prime data page for it to use */
634 	pcpu_init(pc, myid, sizeof(struct pcpu));
635 	dpcpu_init(dpcpu, myid);
636 	pc->pc_apic_id = cpu_apic_ids[myid];
637 	pc->pc_prvspace = pc;
638 	pc->pc_curthread = 0;
639 
640 	gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
641 	gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
642 
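	/*
	 * Build this CPU's private GDT: each CPU owns NGDT consecutive
	 * descriptors in the global gdt[] array, including the per-CPU data
	 * segment and TSS descriptor set up just above.
	 */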
643 	for (x = 0; x < NGDT; x++) {
644 		ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
645 	}
646 
647 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
648 	r_gdt.rd_base = (int) &gdt[myid * NGDT];
649 	lgdt(&r_gdt);			/* does magic intra-segment return */
650 
651 	lidt(&r_idt);
652 
653 	lldt(_default_ldt);
654 	PCPU_SET(currentldt, _default_ldt);
655 
656 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
657 	gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
658 	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
659 	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
660 	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
661 	PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
662 	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
663 	ltr(gsel_tss);
664 
665 	PCPU_SET(fsgs_gdt, &gdt[myid * NGDT + GUFS_SEL].sd);
666 
667 	/*
668 	 * Set to a known state:
669 	 * Set by mpboot.s: CR0_PG, CR0_PE
670 	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
671 	 */
672 	cr0 = rcr0();
673 	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
674 	load_cr0(cr0);
675 	CHECK_WRITE(0x38, 5);
676 
677 	/* Disable local APIC just to be sure. */
678 	lapic_disable();
679 
680 	/* signal our startup to the BSP. */
681 	mp_naps++;
682 	CHECK_WRITE(0x39, 6);
683 
684 	/* Spin until the BSP releases the APs. */
685 	while (!aps_ready)
686 		ia32_pause();
687 
688 	/* BSP may have changed PTD while we were waiting */
689 	invltlb();
690 	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
691 		invlpg(addr);
692 
693 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
694 	lidt(&r_idt);
695 #endif
696 
697 	/* Initialize the PAT MSR if present. */
698 	pmap_init_pat();
699 
700 	/* set up CPU registers and state */
701 	cpu_setregs();
702 
703 	/* set up FPU state on the AP */
704 	npxinit();
705 
706 	/* set up SSE registers */
707 	enable_sse();
708 
709 #ifdef PAE
710 	/* Enable the PTE no-execute bit. */
711 	if ((amd_feature & AMDID_NX) != 0) {
712 		uint64_t msr;
713 
714 		msr = rdmsr(MSR_EFER) | EFER_NXE;
715 		wrmsr(MSR_EFER, msr);
716 	}
717 #endif
718 
719 	/* A quick check from sanity claus */
720 	if (PCPU_GET(apic_id) != lapic_id()) {
721 		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
722 		printf("SMP: actual apic_id = %d\n", lapic_id());
723 		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
724 		panic("cpuid mismatch! boom!!");
725 	}
726 
727 	/* Initialize curthread. */
728 	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
729 	PCPU_SET(curthread, PCPU_GET(idlethread));
730 
731 	mca_init();
732 
733 	mtx_lock_spin(&ap_boot_mtx);
734 
735 	/* Init the local APIC for IRQs */
736 	lapic_setup(1);
737 
738 	/* Set memory range attributes for this CPU to match the BSP */
739 	mem_range_AP_init();
740 
741 	smp_cpus++;
742 
743 	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
744 	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
745 
746 	/* Determine if we are a logical CPU. */
747 	/* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */
748 	if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0)
749 		logical_cpus_mask |= PCPU_GET(cpumask);
750 
751 	/* Determine if we are a hyperthread. */
752 	if (hyperthreading_cpus > 1 &&
753 	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
754 		hyperthreading_cpus_mask |= PCPU_GET(cpumask);
755 
756 	/* Build our map of 'other' CPUs. */
757 	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
758 
759 	if (bootverbose)
760 		lapic_dump("AP");
761 
762 	if (smp_cpus == mp_ncpus) {
763 		/* enable IPIs, TLB shootdown, freezes, etc. */
764 		atomic_store_rel_int(&smp_started, 1);
765 		smp_active = 1;	 /* historic */
766 	}
767 
768 	mtx_unlock_spin(&ap_boot_mtx);
769 
770 	/* Wait until all the APs are up. */
771 	while (smp_started == 0)
772 		ia32_pause();
773 
774 	/* Start per-CPU event timers. */
775 	cpu_initclocks_ap();
776 
777 	/* Enter the scheduler. */
778 	sched_throw(NULL);
779 
780 	panic("scheduler returned us to %s", __func__);
781 	/* NOTREACHED */
782 }
783 
784 /*******************************************************************
785  * local functions and data
786  */
787 
788 /*
789  * We tell the I/O APIC code about all the CPUs that we want to receive
790  * interrupts.  If we don't want certain CPUs to receive IRQs, we
791  * can simply not tell the I/O APIC code about them in this function.
792  * We also do not tell it about the BSP since it tells itself about
793  * the BSP internally to work with UP kernels and on UP machines.
794  */
795 static void
796 set_interrupt_apic_ids(void)
797 {
798 	u_int i, apic_id;
799 
800 	for (i = 0; i < MAXCPU; i++) {
801 		apic_id = cpu_apic_ids[i];
802 		if (apic_id == -1)
803 			continue;
804 		if (cpu_info[apic_id].cpu_bsp)
805 			continue;
806 		if (cpu_info[apic_id].cpu_disabled)
807 			continue;
808 
809 		/* Don't let hyperthreads service interrupts. */
810 		if (hyperthreading_cpus > 1 &&
811 		    apic_id % hyperthreading_cpus != 0)
812 			continue;
813 
814 		intr_add_cpu(i);
815 	}
816 }
817 
818 /*
819  * Assign logical CPU IDs to local APICs.
820  */
821 static void
822 assign_cpu_ids(void)
823 {
824 	u_int i;
825 
826 	TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
827 	    &hyperthreading_allowed);
828 
829 	/* Check for explicitly disabled CPUs. */
830 	for (i = 0; i <= MAX_APIC_ID; i++) {
831 		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
832 			continue;
833 
834 		if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) {
835 			cpu_info[i].cpu_hyperthread = 1;
836 #if defined(SCHED_ULE)
837 			/*
838 			 * Don't use HT CPU if it has been disabled by a
839 			 * tunable.
840 			 */
841 			if (hyperthreading_allowed == 0) {
842 				cpu_info[i].cpu_disabled = 1;
843 				continue;
844 			}
845 #endif
846 		}
847 
848 		/* Don't use this CPU if it has been disabled by a tunable. */
849 		if (resource_disabled("lapic", i)) {
850 			cpu_info[i].cpu_disabled = 1;
851 			continue;
852 		}
853 	}
854 
855 	/*
856 	 * Assign CPU IDs to local APIC IDs and disable any CPUs
857 	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
858 	 *
859 	 * To minimize confusion for userland, we attempt to number
860 	 * CPUs such that all threads and cores in a package are
861 	 * grouped together.  For now we assume that the BSP is always
862 	 * the first thread in a package and just start adding APs
863 	 * starting with the BSP's APIC ID.
864 	 */
865 	mp_ncpus = 1;
866 	cpu_apic_ids[0] = boot_cpu_id;
867 	apic_cpuids[boot_cpu_id] = 0;
868 	for (i = boot_cpu_id + 1; i != boot_cpu_id;
869 	     i == MAX_APIC_ID ? i = 0 : i++) {
870 		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
871 		    cpu_info[i].cpu_disabled)
872 			continue;
873 
874 		if (mp_ncpus < MAXCPU) {
875 			cpu_apic_ids[mp_ncpus] = i;
876 			apic_cpuids[i] = mp_ncpus;
877 			mp_ncpus++;
878 		} else
879 			cpu_info[i].cpu_disabled = 1;
880 	}
881 	KASSERT(mp_maxid >= mp_ncpus - 1,
882 	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
883 	    mp_ncpus));
884 }
885 
886 /*
887  * start each AP in our list
888  */
889 /* Lowest 1MB is already mapped: don't touch */
890 #define TMPMAP_START 1
891 static int
892 start_all_aps(void)
893 {
894 #ifndef PC98
895 	u_char mpbiosreason;
896 #endif
897 	uintptr_t kptbase;
898 	u_int32_t mpbioswarmvec;
899 	int apic_id, cpu, i;
900 
901 	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
902 
903 	/* install the AP 1st level boot code */
904 	install_ap_tramp();
905 
906 	/* save the current value of the warm-start vector */
907 	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
908 #ifndef PC98
909 	outb(CMOS_REG, BIOS_RESET);
910 	mpbiosreason = inb(CMOS_DATA);
911 #endif
912 
913 	/* set up temporary P==V mapping for AP boot */
914 	/* XXX this is a hack, we should boot the AP on its own stack/PTD */
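	/*
	 * The trampoline enables paging while still executing at its low
	 * physical address, so low virtual addresses must map 1:1 onto
	 * physical memory until mpboot.s finally jumps to MPentry above
	 * KERNBASE; the mapping is torn down again once all APs are up.
	 */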
915 
916 	kptbase = (uintptr_t)(void *)KPTphys;
917 	for (i = TMPMAP_START; i < NKPT; i++)
918 		PTD[i] = (pd_entry_t)(PG_V | PG_RW |
919 		    ((kptbase + i * PAGE_SIZE) & PG_FRAME));
920 	invltlb();
921 
922 	/* start each AP */
923 	for (cpu = 1; cpu < mp_ncpus; cpu++) {
924 		apic_id = cpu_apic_ids[cpu];
925 
926 		/* allocate and set up a boot stack data page */
927 		bootstacks[cpu] =
928 		    (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
929 		dpcpu = (void *)kmem_alloc(kernel_map, DPCPU_SIZE);
930 		/* set up a vector to our boot code */
931 		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
932 		*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
933 #ifndef PC98
934 		outb(CMOS_REG, BIOS_RESET);
935 		outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
936 #endif
937 
938 		bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 4;
939 		bootAP = cpu;
940 
941 		/* attempt to start the Application Processor */
942 		CHECK_INIT(99);	/* setup checkpoints */
943 		if (!start_ap(apic_id)) {
944 			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
945 			CHECK_PRINT("trace");	/* show checkpoints */
946 			/* better panic as the AP may be running loose */
947 			printf("panic y/n? [y] ");
948 			if (cngetc() != 'n')
949 				panic("bye-bye");
950 		}
951 		CHECK_PRINT("trace");		/* show checkpoints */
952 
953 		all_cpus |= (1 << cpu);		/* record AP in CPU map */
954 	}
955 
956 	/* build our map of 'other' CPUs */
957 	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
958 
959 	/* restore the warmstart vector */
960 	*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
961 
962 #ifndef PC98
963 	outb(CMOS_REG, BIOS_RESET);
964 	outb(CMOS_DATA, mpbiosreason);
965 #endif
966 
967 	/* Undo V==P hack from above */
968 	for (i = TMPMAP_START; i < NKPT; i++)
969 		PTD[i] = 0;
970 	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);
971 
972 	/* number of APs actually started */
973 	return mp_naps;
974 }
975 
976 /*
977  * load the 1st level AP boot code into base memory.
978  */
979 
980 /* targets for relocation */
981 extern void bigJump(void);
982 extern void bootCodeSeg(void);
983 extern void bootDataSeg(void);
984 extern void MPentry(void);
985 extern u_int MP_GDT;
986 extern u_int mp_gdtbase;
987 
988 static void
989 install_ap_tramp(void)
990 {
991 	int     x;
992 	int     size = *(int *) ((u_long) & bootMP_size);
993 	vm_offset_t va = boot_address + KERNBASE;
994 	u_char *src = (u_char *) ((u_long) bootMP);
995 	u_char *dst = (u_char *) va;
996 	u_int   boot_base = (u_int) bootMP;
997 	u_int8_t *dst8;
998 	u_int16_t *dst16;
999 	u_int32_t *dst32;
1000 
1001 	KASSERT(size <= PAGE_SIZE,
1002 	    ("'size' does not fit into PAGE_SIZE, as expected."));
1003 	pmap_kenter(va, boot_address);
1004 	pmap_invalidate_page (kernel_pmap, va);
1005 	for (x = 0; x < size; ++x)
1006 		*dst++ = *src++;
1007 
1008 	/*
1009 	 * Modify addresses in the code we just moved to basemem.  Unfortunately we
1010 	 * need fairly detailed info about mpboot.s for this to work.  Changes
1011 	 * to mpboot.s might require changes here.
1012 	 */
1013 
1014 	/* boot code is located in KERNEL space */
1015 	dst = (u_char *) va;
1016 
1017 	/* modify the lgdt arg */
1018 	dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
1019 	*dst32 = boot_address + ((u_int) & MP_GDT - boot_base);
1020 
1021 	/* modify the ljmp target for MPentry() */
1022 	dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
1023 	*dst32 = ((u_int) MPentry - KERNBASE);
1024 
1025 	/* modify the target for boot code segment */
1026 	dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
1027 	dst8 = (u_int8_t *) (dst16 + 1);
1028 	*dst16 = (u_int) boot_address & 0xffff;
1029 	*dst8 = ((u_int) boot_address >> 16) & 0xff;
1030 
1031 	/* modify the target for boot data segment */
1032 	dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
1033 	dst8 = (u_int8_t *) (dst16 + 1);
1034 	*dst16 = (u_int) boot_address & 0xffff;
1035 	*dst8 = ((u_int) boot_address >> 16) & 0xff;
1036 }
1037 
1038 /*
1039  * This function starts the AP (application processor) identified
1040  * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
1041  * by its APIC ID 'apic_id'.  It does quite a "song and dance"
1042  * of the different hardware we might encounter.  It isn't pretty,
1043  * but it seems to work.
1044  */
1045 static int
1046 start_ap(int apic_id)
1047 {
1048 	int vector, ms;
1049 	int cpus;
1050 
1051 	/* calculate the vector */
1052 	vector = (boot_address >> 12) & 0xff;
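	/*
	 * The STARTUP IPI vector is the physical page number of the
	 * trampoline: the AP begins executing in real mode at address
	 * vector << 12, i.e. at boot_address.
	 */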
1053 
1054 	/* used as a watchpoint to signal AP startup */
1055 	cpus = mp_naps;
1056 
1057 	/*
1058 	 * First we do an INIT/RESET IPI.  This INIT IPI might be run, resetting
1059 	 * and running the target CPU, OR this INIT IPI might be latched (P5
1060 	 * bug), with the CPU waiting for a STARTUP IPI, OR this INIT IPI might
1061 	 * be ignored.
1062 	 */
1063 
1064 	/* do an INIT IPI: assert RESET */
1065 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
1066 	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
1067 
1068 	/* wait for pending status end */
1069 	lapic_ipi_wait(-1);
1070 
1071 	/* do an INIT IPI: deassert RESET */
1072 	lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
1073 	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);
1074 
1075 	/* wait for pending status end */
1076 	DELAY(10000);		/* wait ~10ms */
1077 	lapic_ipi_wait(-1);
1078 
1079 	/*
1080 	 * Next we do a STARTUP IPI: the previous INIT IPI might still be
1081 	 * latched (P5 bug); this 1st STARTUP would then terminate
1082 	 * immediately, and the previously started INIT IPI would continue.  OR
1083 	 * the previous INIT IPI has already run, and this STARTUP IPI will
1084 	 * run.  OR the previous INIT IPI was ignored, and this STARTUP IPI
1085 	 * will run.
1086 	 */
1087 
1088 	/* do a STARTUP IPI */
1089 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
1090 	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
1091 	    vector, apic_id);
1092 	lapic_ipi_wait(-1);
1093 	DELAY(200);		/* wait ~200us */
1094 
1095 	/*
1096 	 * Finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
1097 	 * the previous STARTUP IPI was cancelled by a latched INIT IPI, OR
1098 	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
1099 	 * recognized after a hardware RESET or INIT IPI.
1100 	 */
1101 
1102 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
1103 	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
1104 	    vector, apic_id);
1105 	lapic_ipi_wait(-1);
1106 	DELAY(200);		/* wait ~200us */
1107 
1108 	/* Wait up to 5 seconds for it to start. */
1109 	for (ms = 0; ms < 5000; ms++) {
1110 		if (mp_naps > cpus)
1111 			return 1;	/* return SUCCESS */
1112 		DELAY(1000);
1113 	}
1114 	return 0;		/* return FAILURE */
1115 }
1116 
1117 #ifdef COUNT_XINVLTLB_HITS
1118 u_int xhits_gbl[MAXCPU];
1119 u_int xhits_pg[MAXCPU];
1120 u_int xhits_rng[MAXCPU];
1121 SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
1122 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
1123     sizeof(xhits_gbl), "IU", "");
1124 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
1125     sizeof(xhits_pg), "IU", "");
1126 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
1127     sizeof(xhits_rng), "IU", "");
1128 
1129 u_int ipi_global;
1130 u_int ipi_page;
1131 u_int ipi_range;
1132 u_int ipi_range_size;
1133 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
1134 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
1135 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
1136 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
1137     0, "");
1138 
1139 u_int ipi_masked_global;
1140 u_int ipi_masked_page;
1141 u_int ipi_masked_range;
1142 u_int ipi_masked_range_size;
1143 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
1144     &ipi_masked_global, 0, "");
1145 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
1146     &ipi_masked_page, 0, "");
1147 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
1148     &ipi_masked_range, 0, "");
1149 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
1150     &ipi_masked_range_size, 0, "");
1151 #endif /* COUNT_XINVLTLB_HITS */
1152 
1153 /*
1154  * Flush the TLB on all other CPUs.
1155  */
1156 static void
1157 smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
1158 {
1159 	u_int ncpu;
1160 
1161 	ncpu = mp_ncpus - 1;	/* does not shoot down self */
1162 	if (ncpu < 1)
1163 		return;		/* no other cpus */
1164 	if (!(read_eflags() & PSL_I))
1165 		panic("%s: interrupts disabled", __func__);
1166 	mtx_lock_spin(&smp_ipi_mtx);
1167 	smp_tlb_addr1 = addr1;
1168 	smp_tlb_addr2 = addr2;
1169 	atomic_store_rel_int(&smp_tlb_wait, 0);
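	/*
	 * Each target CPU's invalidation handler increments smp_tlb_wait
	 * once it has processed the request, so spinning below until the
	 * counter reaches ncpu guarantees that every other CPU has flushed
	 * before we return.
	 */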
1170 	ipi_all_but_self(vector);
1171 	while (smp_tlb_wait < ncpu)
1172 		ia32_pause();
1173 	mtx_unlock_spin(&smp_ipi_mtx);
1174 }
1175 
1176 static void
1177 smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
1178 {
1179 	int ncpu, othercpus;
1180 
1181 	othercpus = mp_ncpus - 1;
1182 	if (mask == (u_int)-1) {
1183 		ncpu = othercpus;
1184 		if (ncpu < 1)
1185 			return;
1186 	} else {
1187 		mask &= ~PCPU_GET(cpumask);
1188 		if (mask == 0)
1189 			return;
1190 		ncpu = bitcount32(mask);
1191 		if (ncpu > othercpus) {
1192 			/* XXX this should be a panic offence */
1193 			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
1194 			    ncpu, othercpus);
1195 			ncpu = othercpus;
1196 		}
1197 		/* XXX should be a panic, implied by mask == 0 above */
1198 		if (ncpu < 1)
1199 			return;
1200 	}
1201 	if (!(read_eflags() & PSL_I))
1202 		panic("%s: interrupts disabled", __func__);
1203 	mtx_lock_spin(&smp_ipi_mtx);
1204 	smp_tlb_addr1 = addr1;
1205 	smp_tlb_addr2 = addr2;
1206 	atomic_store_rel_int(&smp_tlb_wait, 0);
1207 	if (mask == (u_int)-1)
1208 		ipi_all_but_self(vector);
1209 	else
1210 		ipi_selected(mask, vector);
1211 	while (smp_tlb_wait < ncpu)
1212 		ia32_pause();
1213 	mtx_unlock_spin(&smp_ipi_mtx);
1214 }
1215 
1216 /*
1217  * Send an IPI to the specified CPU, handling the bitmap logic.
1218  */
1219 static void
1220 ipi_send_cpu(int cpu, u_int ipi)
1221 {
1222 	u_int bitmap, old_pending, new_pending;
1223 
1224 	KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));
1225 
1226 	if (IPI_IS_BITMAPED(ipi)) {
1227 		bitmap = 1 << ipi;
1228 		ipi = IPI_BITMAP_VECTOR;
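		/*
		 * Bitmapped IPIs share the single IPI_BITMAP_VECTOR:
		 * atomically OR the request into the target's pending word
		 * and only send the hardware IPI when the word was previously
		 * empty, since a non-zero word means a vector is already on
		 * its way.
		 */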
1229 		do {
1230 			old_pending = cpu_ipi_pending[cpu];
1231 			new_pending = old_pending | bitmap;
1232 		} while  (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
1233 		    old_pending, new_pending));
1234 		if (old_pending)
1235 			return;
1236 	}
1237 	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
1238 }
1239 
1240 void
1241 smp_cache_flush(void)
1242 {
1243 
1244 	if (smp_started)
1245 		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
1246 }
1247 
1248 void
1249 smp_invltlb(void)
1250 {
1251 
1252 	if (smp_started) {
1253 		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
1254 #ifdef COUNT_XINVLTLB_HITS
1255 		ipi_global++;
1256 #endif
1257 	}
1258 }
1259 
1260 void
1261 smp_invlpg(vm_offset_t addr)
1262 {
1263 
1264 	if (smp_started) {
1265 		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
1266 #ifdef COUNT_XINVLTLB_HITS
1267 		ipi_page++;
1268 #endif
1269 	}
1270 }
1271 
1272 void
1273 smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
1274 {
1275 
1276 	if (smp_started) {
1277 		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
1278 #ifdef COUNT_XINVLTLB_HITS
1279 		ipi_range++;
1280 		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
1281 #endif
1282 	}
1283 }
1284 
1285 void
1286 smp_masked_invltlb(cpumask_t mask)
1287 {
1288 
1289 	if (smp_started) {
1290 		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
1291 #ifdef COUNT_XINVLTLB_HITS
1292 		ipi_masked_global++;
1293 #endif
1294 	}
1295 }
1296 
1297 void
1298 smp_masked_invlpg(cpumask_t mask, vm_offset_t addr)
1299 {
1300 
1301 	if (smp_started) {
1302 		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
1303 #ifdef COUNT_XINVLTLB_HITS
1304 		ipi_masked_page++;
1305 #endif
1306 	}
1307 }
1308 
1309 void
1310 smp_masked_invlpg_range(cpumask_t mask, vm_offset_t addr1, vm_offset_t addr2)
1311 {
1312 
1313 	if (smp_started) {
1314 		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
1315 #ifdef COUNT_XINVLTLB_HITS
1316 		ipi_masked_range++;
1317 		ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
1318 #endif
1319 	}
1320 }
1321 
1322 void
1323 ipi_bitmap_handler(struct trapframe frame)
1324 {
1325 	struct trapframe *oldframe;
1326 	struct thread *td;
1327 	int cpu = PCPU_GET(cpuid);
1328 	u_int ipi_bitmap;
1329 
1330 	critical_enter();
1331 	td = curthread;
1332 	td->td_intr_nesting_level++;
1333 	oldframe = td->td_intr_frame;
1334 	td->td_intr_frame = &frame;
1335 	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
1336 	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
1337 #ifdef COUNT_IPIS
1338 		(*ipi_preempt_counts[cpu])++;
1339 #endif
1340 		sched_preempt(td);
1341 	}
1342 	if (ipi_bitmap & (1 << IPI_AST)) {
1343 #ifdef COUNT_IPIS
1344 		(*ipi_ast_counts[cpu])++;
1345 #endif
1346 		/* Nothing to do for AST */
1347 	}
1348 	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
1349 #ifdef COUNT_IPIS
1350 		(*ipi_hardclock_counts[cpu])++;
1351 #endif
1352 		hardclockintr();
1353 	}
1354 	td->td_intr_frame = oldframe;
1355 	td->td_intr_nesting_level--;
1356 	critical_exit();
1357 }
1358 
1359 /*
1360  * send an IPI to a set of CPUs.
1361  */
1362 void
1363 ipi_selected(cpumask_t cpus, u_int ipi)
1364 {
1365 	int cpu;
1366 
1367 	/*
1368 	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1369 	 * of help in order to understand what the source is.
1370 	 * Set the mask of receiving CPUs for this purpose.
1371 	 */
1372 	if (ipi == IPI_STOP_HARD)
1373 		atomic_set_int(&ipi_nmi_pending, cpus);
1374 
1375 	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
1376 	while ((cpu = ffs(cpus)) != 0) {
1377 		cpu--;
1378 		cpus &= ~(1 << cpu);
1379 		ipi_send_cpu(cpu, ipi);
1380 	}
1381 }
1382 
1383 /*
1384  * send an IPI to a specific CPU.
1385  */
1386 void
1387 ipi_cpu(int cpu, u_int ipi)
1388 {
1389 
1390 	/*
1391 	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1392 	 * of help in order to understand what the source is.
1393 	 * Set the mask of receiving CPUs for this purpose.
1394 	 */
1395 	if (ipi == IPI_STOP_HARD)
1396 		atomic_set_int(&ipi_nmi_pending, 1 << cpu);
1397 
1398 	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1399 	ipi_send_cpu(cpu, ipi);
1400 }
1401 
1402 /*
1403  * send an IPI to all CPUs EXCEPT myself
1404  */
1405 void
1406 ipi_all_but_self(u_int ipi)
1407 {
1408 
1409 	if (IPI_IS_BITMAPED(ipi)) {
1410 		ipi_selected(PCPU_GET(other_cpus), ipi);
1411 		return;
1412 	}
1413 
1414 	/*
1415 	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1416 	 * of help in order to understand what the source is.
1417 	 * Set the mask of receiving CPUs for this purpose.
1418 	 */
1419 	if (ipi == IPI_STOP_HARD)
1420 		atomic_set_int(&ipi_nmi_pending, PCPU_GET(other_cpus));
1421 	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1422 	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
1423 }
1424 
1425 int
1426 ipi_nmi_handler(void)
1427 {
1428 	cpumask_t cpumask;
1429 
1430 	/*
1431 	 * As long as there is not a simple way to know about an NMI's
1432 	 * source, if the bitmask for the current CPU is present in
1433 	 * the global pending bitword, an IPI_STOP_HARD has been issued
1434 	 * and should be handled.
1434 	 * and should be handled.
1435 	 */
1436 	cpumask = PCPU_GET(cpumask);
1437 	if ((ipi_nmi_pending & cpumask) == 0)
1438 		return (1);
1439 
1440 	atomic_clear_int(&ipi_nmi_pending, cpumask);
1441 	cpustop_handler();
1442 	return (0);
1443 }
1444 
1445 /*
1446  * Handle an IPI_STOP by saving our current context and spinning until we
1447  * are resumed.
1448  */
1449 void
1450 cpustop_handler(void)
1451 {
1452 	cpumask_t cpumask;
1453 	u_int cpu;
1454 
1455 	cpu = PCPU_GET(cpuid);
1456 	cpumask = PCPU_GET(cpumask);
1457 
1458 	savectx(&stoppcbs[cpu]);
1459 
1460 	/* Indicate that we are stopped */
1461 	atomic_set_int(&stopped_cpus, cpumask);
1462 
1463 	/* Wait for restart */
1464 	while (!(started_cpus & cpumask))
1465 	    ia32_pause();
1466 
1467 	atomic_clear_int(&started_cpus, cpumask);
1468 	atomic_clear_int(&stopped_cpus, cpumask);
1469 
1470 	if (cpu == 0 && cpustop_restartfunc != NULL) {
1471 		cpustop_restartfunc();
1472 		cpustop_restartfunc = NULL;
1473 	}
1474 }
1475 
1476 /*
1477  * This is called once the rest of the system is up and running and we're
1478  * ready to let the APs out of the pen.
1479  */
1480 static void
1481 release_aps(void *dummy __unused)
1482 {
1483 
1484 	if (mp_ncpus == 1)
1485 		return;
1486 	atomic_store_rel_int(&aps_ready, 1);
1487 	while (smp_started == 0)
1488 		ia32_pause();
1489 }
1490 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1491 
1492 static int
1493 sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
1494 {
1495 	cpumask_t mask;
1496 	int error;
1497 
1498 	mask = hlt_cpus_mask;
1499 	error = sysctl_handle_int(oidp, &mask, 0, req);
1500 	if (error || !req->newptr)
1501 		return (error);
1502 
1503 	if (logical_cpus_mask != 0 &&
1504 	    (mask & logical_cpus_mask) == logical_cpus_mask)
1505 		hlt_logical_cpus = 1;
1506 	else
1507 		hlt_logical_cpus = 0;
1508 
1509 	if (! hyperthreading_allowed)
1510 		mask |= hyperthreading_cpus_mask;
1511 
1512 	if ((mask & all_cpus) == all_cpus)
1513 		mask &= ~(1<<0);
1514 	hlt_cpus_mask = mask;
1515 	return (error);
1516 }
1517 SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
1518     0, 0, sysctl_hlt_cpus, "IU",
1519     "Bitmap of CPUs to halt.  101 (binary) will halt CPUs 0 and 2.");
1520 
1521 static int
1522 sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
1523 {
1524 	int disable, error;
1525 
1526 	disable = hlt_logical_cpus;
1527 	error = sysctl_handle_int(oidp, &disable, 0, req);
1528 	if (error || !req->newptr)
1529 		return (error);
1530 
1531 	if (disable)
1532 		hlt_cpus_mask |= logical_cpus_mask;
1533 	else
1534 		hlt_cpus_mask &= ~logical_cpus_mask;
1535 
1536 	if (! hyperthreading_allowed)
1537 		hlt_cpus_mask |= hyperthreading_cpus_mask;
1538 
1539 	if ((hlt_cpus_mask & all_cpus) == all_cpus)
1540 		hlt_cpus_mask &= ~(1<<0);
1541 
1542 	hlt_logical_cpus = disable;
1543 	return (error);
1544 }
1545 
1546 static int
1547 sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS)
1548 {
1549 	int allowed, error;
1550 
1551 	allowed = hyperthreading_allowed;
1552 	error = sysctl_handle_int(oidp, &allowed, 0, req);
1553 	if (error || !req->newptr)
1554 		return (error);
1555 
1556 #ifdef SCHED_ULE
1557 	/*
1558 	 * SCHED_ULE doesn't allow enabling/disabling HT cores at
1559 	 * run-time.
1560 	 */
1561 	if (allowed != hyperthreading_allowed)
1562 		return (ENOTSUP);
1563 	return (error);
1564 #endif
1565 
1566 	if (allowed)
1567 		hlt_cpus_mask &= ~hyperthreading_cpus_mask;
1568 	else
1569 		hlt_cpus_mask |= hyperthreading_cpus_mask;
1570 
1571 	if (logical_cpus_mask != 0 &&
1572 	    (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask)
1573 		hlt_logical_cpus = 1;
1574 	else
1575 		hlt_logical_cpus = 0;
1576 
1577 	if ((hlt_cpus_mask & all_cpus) == all_cpus)
1578 		hlt_cpus_mask &= ~(1<<0);
1579 
1580 	hyperthreading_allowed = allowed;
1581 	return (error);
1582 }
1583 
1584 static void
1585 cpu_hlt_setup(void *dummy __unused)
1586 {
1587 
1588 	if (logical_cpus_mask != 0) {
1589 		TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
1590 		    &hlt_logical_cpus);
1591 		sysctl_ctx_init(&logical_cpu_clist);
1592 		SYSCTL_ADD_PROC(&logical_cpu_clist,
1593 		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
1594 		    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
1595 		    sysctl_hlt_logical_cpus, "IU", "");
1596 		SYSCTL_ADD_UINT(&logical_cpu_clist,
1597 		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
1598 		    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
1599 		    &logical_cpus_mask, 0, "");
1600 
1601 		if (hlt_logical_cpus)
1602 			hlt_cpus_mask |= logical_cpus_mask;
1603 
1604 		/*
1605 		 * If necessary for security purposes, force
1606 		 * hyperthreading off, regardless of the value
1607 		 * of hlt_logical_cpus.
1608 		 */
1609 		if (hyperthreading_cpus_mask) {
1610 			SYSCTL_ADD_PROC(&logical_cpu_clist,
1611 			    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
1612 			    "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW,
1613 			    0, 0, sysctl_hyperthreading_allowed, "IU", "");
1614 			if (! hyperthreading_allowed)
1615 				hlt_cpus_mask |= hyperthreading_cpus_mask;
1616 		}
1617 	}
1618 }
1619 SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL);
1620 
1621 int
1622 mp_grab_cpu_hlt(void)
1623 {
1624 	cpumask_t mask;
1625 #ifdef MP_WATCHDOG
1626 	u_int cpuid;
1627 #endif
1628 	int retval;
1629 
1630 	mask = PCPU_GET(cpumask);
1631 #ifdef MP_WATCHDOG
1632 	cpuid = PCPU_GET(cpuid);
1633 	ap_watchdog(cpuid);
1634 #endif
1635 
1636 	retval = 0;
1637 	while (mask & hlt_cpus_mask) {
1638 		retval = 1;
1639 		__asm __volatile("sti; hlt" : : : "memory");
1640 	}
1641 	return (retval);
1642 }
1643 
1644 #ifdef COUNT_IPIS
1645 /*
1646  * Set up interrupt counters for IPI handlers.
1647  */
1648 static void
1649 mp_ipi_intrcnt(void *dummy)
1650 {
1651 	char buf[64];
1652 	int i;
1653 
1654 	CPU_FOREACH(i) {
1655 		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
1656 		intrcnt_add(buf, &ipi_invltlb_counts[i]);
1657 		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
1658 		intrcnt_add(buf, &ipi_invlrng_counts[i]);
1659 		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
1660 		intrcnt_add(buf, &ipi_invlpg_counts[i]);
1661 		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
1662 		intrcnt_add(buf, &ipi_preempt_counts[i]);
1663 		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
1664 		intrcnt_add(buf, &ipi_ast_counts[i]);
1665 		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
1666 		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
1667 		snprintf(buf, sizeof(buf), "cpu%d:lazypmap", i);
1668 		intrcnt_add(buf, &ipi_lazypmap_counts[i]);
1669 		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
1670 		intrcnt_add(buf, &ipi_hardclock_counts[i]);
1671 	}
1672 }
1673 SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
1674 #endif
1675