xref: /dragonfly/sys/platform/pc64/apic/lapic.c (revision a42bad2d)
/*
 * Copyright (c) 1996, by Steve Passe
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/i386/i386/mpapic.c,v 1.37.2.7 2003/01/25 02:31:47 peter Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/bus.h>
#include <sys/machintr.h>
#include <machine/globaldata.h>
#include <machine/clock.h>
#include <machine/limits.h>
#include <machine/smp.h>
#include <machine/md_var.h>
#include <machine/pmap.h>
#include <machine/specialreg.h>
#include <machine_base/apic/lapic.h>
#include <machine_base/apic/ioapic.h>
#include <machine_base/apic/ioapic_abi.h>
#include <machine_base/apic/apicvar.h>
#include <machine_base/icu/icu_var.h>
#include <machine/segments.h>
#include <sys/thread2.h>
#include <sys/spinlock2.h>

#include <machine/cputypes.h>
#include <machine/intr_machdep.h>

#if !defined(KTR_LAPIC)
#define KTR_LAPIC	KTR_ALL
#endif
KTR_INFO_MASTER(lapic);
KTR_INFO(KTR_LAPIC, lapic, eoi, 0, "eoi");
#define log_lapic(name)     KTR_LOG(lapic_ ## name)

extern int naps;

volatile lapic_t *lapic;

static void	lapic_timer_calibrate(void);
static void	lapic_timer_set_divisor(int);
static void	lapic_timer_fixup_handler(void *);
static void	lapic_timer_restart_handler(void *);


static int	lapic_timer_enable = 1;
TUNABLE_INT("hw.lapic_timer_enable", &lapic_timer_enable);

static int	lapic_timer_tscdeadline = 1;
TUNABLE_INT("hw.lapic_timer_tscdeadline", &lapic_timer_tscdeadline);

static int	lapic_calibrate_test = 0;
TUNABLE_INT("hw.lapic_calibrate_test", &lapic_calibrate_test);

static int	lapic_calibrate_fast = 1;
TUNABLE_INT("hw.lapic_calibrate_fast", &lapic_calibrate_fast);

static void	lapic_timer_tscdlt_reload(struct cputimer_intr *, sysclock_t);
static void	lapic_timer_intr_reload(struct cputimer_intr *, sysclock_t);
static void	lapic_timer_intr_enable(struct cputimer_intr *);
static void	lapic_timer_intr_restart(struct cputimer_intr *);
static void	lapic_timer_intr_pmfixup(struct cputimer_intr *);

static struct cputimer_intr lapic_cputimer_intr = {
	.freq = 0,
	.reload = lapic_timer_intr_reload,
	.enable = lapic_timer_intr_enable,
	.config = cputimer_intr_default_config,
	.restart = lapic_timer_intr_restart,
	.pmfixup = lapic_timer_intr_pmfixup,
	.initclock = cputimer_intr_default_initclock,
	.pcpuhand = NULL,
	.next = SLIST_ENTRY_INITIALIZER,
	.name = "lapic",
	.type = CPUTIMER_INTR_LAPIC,
	.prio = CPUTIMER_INTR_PRIO_LAPIC,
	.caps = CPUTIMER_INTR_CAP_NONE,
	.priv = NULL
};

static int		lapic_timer_divisor_idx = -1;
static const uint32_t	lapic_timer_divisors[] = {
	APIC_TDCR_2,	APIC_TDCR_4,	APIC_TDCR_8,	APIC_TDCR_16,
	APIC_TDCR_32,	APIC_TDCR_64,	APIC_TDCR_128,	APIC_TDCR_1
};
#define APIC_TIMER_NDIVISORS (int)(NELEM(lapic_timer_divisors))

static int	lapic_use_tscdeadline = 0;
/* The raw TSC frequency might not fit into a sysclock_t value. */
static int	lapic_timer_tscfreq_shift;
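/*
 * E.g. (hypothetical numbers): a 3 GHz invariant TSC exceeds INT_MAX
 * (2,147,483,647), so lapic_timer_calibrate() below picks a shift of 1
 * and advertises a 1.5 GHz cputimer frequency instead.
 */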

/*
 * APIC ID <-> CPU ID mapping structures.
 */
int	cpu_id_to_apic_id[NAPICID];
int	apic_id_to_cpu_id[NAPICID];
int	lapic_enable = 1;

/* Separate cachelines for each cpu's info. */
struct deadlines {
	uint64_t timestamp;
	uint64_t downcount_time;
	uint64_t padding[6];
};
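/*
 * Note: 2 + 6 eight-byte fields = 64 bytes per entry, so each cpu's
 * entry occupies its own cache line on typical x86 (64-byte lines).
 */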
struct deadlines *tsc_deadlines = NULL;

static void	lapic_eoi_func(void);

void		(*lapic_eoi)(void);

/*
 * Enable LAPIC, configure interrupts.
 */
void
lapic_init(boolean_t bsp)
{
	uint32_t timer;
	u_int   temp;

	if (bsp) {
		/* Decide whether we want to use TSC Deadline mode. */
		if (lapic_timer_tscdeadline != 0 &&
		    (cpu_feature2 & CPUID2_TSCDLT) &&
		    tsc_invariant && tsc_frequency != 0) {
			lapic_use_tscdeadline = 1;
			tsc_deadlines = kmalloc_cachealign(
			    sizeof(struct deadlines) * (naps + 1),
			    M_DEVBUF, M_WAITOK | M_ZERO);
		}
	}

	/*
	 * Install vectors
	 *
	 * Since IDT is shared between BSP and APs, these vectors
	 * only need to be installed once; we do it on BSP.
	 */
	if (bsp) {
		if (cpu_vendor_id == CPU_VENDOR_AMD &&
		    CPUID_TO_FAMILY(cpu_id) >= 0x0f &&
		    CPUID_TO_FAMILY(cpu_id) < 0x17) {	/* XXX */
			uint32_t tcr;

			/*
			 * Set the LINTEN bit in the HyperTransport
			 * Transaction Control Register.
			 *
			 * This will cause EXTINT and NMI interrupts
			 * routed over the hypertransport bus to be
			 * fed into the LAPIC LINT0/LINT1.  If the bit
			 * isn't set, the interrupts will go to the
			 * general cpu INTR/NMI pins.  On a dual-core
			 * cpu the interrupt winds up going to BOTH cpus.
			 * The first cpu that does the interrupt ack
			 * cycle will get the correct interrupt.  The
			 * second cpu that does it will get a spurious
			 * interrupt vector (typically IRQ 7).
			 */
			outl(0x0cf8,
			    (1 << 31) |	/* enable */
			    (0 << 16) |	/* bus */
			    (0x18 << 11) | /* dev (cpu + 0x18) */
			    (0 << 8) |	/* func */
			    0x68	/* reg */
			    );
			tcr = inl(0xcfc);
			if ((tcr & 0x00010000) == 0) {
				kprintf("LAPIC: AMD LINTEN on\n");
				outl(0xcfc, tcr|0x00010000);
			}
			outl(0x0cf8, 0);
		}

		/* Install a 'Spurious INTerrupt' vector */
		setidt_global(XSPURIOUSINT_OFFSET, Xspuriousint,
		    SDT_SYSIGT, SEL_KPL, 0);

		/* Install a timer vector */
		setidt_global(XTIMER_OFFSET, Xtimer,
		    SDT_SYSIGT, SEL_KPL, 0);

		/* Install an inter-CPU IPI for TLB invalidation */
		setidt_global(XINVLTLB_OFFSET, Xinvltlb,
		    SDT_SYSIGT, SEL_KPL, 0);

		/* Install an inter-CPU IPI for IPIQ messaging */
		setidt_global(XIPIQ_OFFSET, Xipiq,
		    SDT_SYSIGT, SEL_KPL, 0);

		/* Install an inter-CPU IPI for CPU stop/restart */
		setidt_global(XCPUSTOP_OFFSET, Xcpustop,
		    SDT_SYSIGT, SEL_KPL, 0);

		/* Install an inter-CPU IPI for the Xsniff sampler */
		setidt_global(XSNIFF_OFFSET, Xsniff,
		    SDT_SYSIGT, SEL_KPL, 0);
	}

	/*
	 * Setup LINT0 as ExtINT on the BSP.  This is theoretically an
	 * aggregate interrupt input from the 8259.  The INTA cycle
	 * will be routed to the external controller (the 8259) which
	 * is expected to supply the vector.
	 *
	 * Must be setup edge triggered, active high.
	 *
	 * Disable LINT0 on BSP, if I/O APIC is enabled.
	 *
	 * Disable LINT0 on the APs.  It doesn't matter what delivery
	 * mode we use because we leave it masked.
	 */
	temp = lapic->lvt_lint0;
	temp &= ~(APIC_LVT_MASKED | APIC_LVT_TRIG_MASK |
		  APIC_LVT_POLARITY_MASK | APIC_LVT_DM_MASK);
	if (bsp) {
		temp |= APIC_LVT_DM_EXTINT;
		if (ioapic_enable)
			temp |= APIC_LVT_MASKED;
	} else {
		temp |= APIC_LVT_DM_FIXED | APIC_LVT_MASKED;
	}
	lapic->lvt_lint0 = temp;

	/*
	 * Setup LINT1 as NMI.
	 *
	 * Must be setup edge trigger, active high.
	 *
	 * Enable LINT1 on BSP, if I/O APIC is enabled.
	 *
	 * Disable LINT1 on the APs.
	 */
	temp = lapic->lvt_lint1;
	temp &= ~(APIC_LVT_MASKED | APIC_LVT_TRIG_MASK |
		  APIC_LVT_POLARITY_MASK | APIC_LVT_DM_MASK);
	temp |= APIC_LVT_MASKED | APIC_LVT_DM_NMI;
	if (bsp && ioapic_enable)
		temp &= ~APIC_LVT_MASKED;
	lapic->lvt_lint1 = temp;

	/*
	 * Mask the LAPIC error interrupt, LAPIC performance counter
	 * interrupt.
	 */
	lapic->lvt_error = lapic->lvt_error | APIC_LVT_MASKED;
	lapic->lvt_pcint = lapic->lvt_pcint | APIC_LVT_MASKED;

	/*
	 * Set LAPIC timer vector and mask the LAPIC timer interrupt.
	 */
	timer = lapic->lvt_timer;
	timer &= ~APIC_LVTT_VECTOR;
	timer |= XTIMER_OFFSET;
	timer |= APIC_LVTT_MASKED;
	lapic->lvt_timer = timer;

	/*
	 * Set the Task Priority Register as needed.   At the moment allow
	 * interrupts on all cpus (the APs will remain CLId until they are
	 * ready to deal).
	 */
	temp = lapic->tpr;
	temp &= ~APIC_TPR_PRIO;		/* clear priority field */
	lapic->tpr = temp;

	/*
	 * AMD specific setup
	 */
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    (lapic->version & APIC_VER_AMD_EXT_SPACE)) {
		uint32_t ext_feat;
		uint32_t count;
		uint32_t max_count;
		uint32_t lvt;
		uint32_t i;

		ext_feat = lapic->ext_feat;
		count = (ext_feat & APIC_EXTFEAT_MASK) >> APIC_EXTFEAT_SHIFT;
		max_count = sizeof(lapic->ext_lvt) / sizeof(lapic->ext_lvt[0]);
		if (count > max_count)
			count = max_count;
		for (i = 0; i < count; ++i) {
			lvt = lapic->ext_lvt[i].lvt;

			lvt &= ~(APIC_LVT_POLARITY_MASK | APIC_LVT_TRIG_MASK |
				 APIC_LVT_DM_MASK | APIC_LVT_MASKED);
			lvt |= APIC_LVT_MASKED | APIC_LVT_DM_FIXED;

			switch(i) {
			case APIC_EXTLVT_IBS:
				break;
			case APIC_EXTLVT_MCA:
				break;
			case APIC_EXTLVT_DEI:
				break;
			case APIC_EXTLVT_SBI:
				break;
			default:
				break;
			}
			if (bsp) {
				kprintf("   LAPIC AMD elvt%d: 0x%08x",
					i, lapic->ext_lvt[i].lvt);
				if (lapic->ext_lvt[i].lvt != lvt)
					kprintf(" -> 0x%08x", lvt);
				kprintf("\n");
			}
			lapic->ext_lvt[i].lvt = lvt;
		}
	}

	/*
	 * Enable the LAPIC
	 */
	temp = lapic->svr;
	temp |= APIC_SVR_ENABLE;	/* enable the LAPIC */
	temp &= ~APIC_SVR_FOCUS_DISABLE; /* enable lopri focus processor */

	if (lapic->version & APIC_VER_EOI_SUPP) {
		if (temp & APIC_SVR_EOI_SUPP) {
			temp &= ~APIC_SVR_EOI_SUPP;
			if (bsp)
				kprintf("    LAPIC disabling EOI supp\n");
		}
	}

	/*
	 * Set the spurious interrupt vector.  The low 4 bits of the vector
	 * must be 1111.
	 */
	if ((XSPURIOUSINT_OFFSET & 0x0F) != 0x0F)
		panic("bad XSPURIOUSINT_OFFSET: 0x%08x", XSPURIOUSINT_OFFSET);
	temp &= ~APIC_SVR_VECTOR;
	temp |= XSPURIOUSINT_OFFSET;

	lapic->svr = temp;

	/*
	 * Pump out a few EOIs to clean out interrupts that got through
	 * before we were able to set the TPR.
	 */
	lapic->eoi = 0;
	lapic->eoi = 0;
	lapic->eoi = 0;

	if (bsp) {
		lapic_timer_calibrate();
		if (lapic_timer_enable) {
			if (cpu_thermal_feature & CPUID_THERMAL_ARAT) {
				/*
				 * Local APIC timer will not stop
				 * in deep C-state.
				 */
				lapic_cputimer_intr.caps |=
				    CPUTIMER_INTR_CAP_PS;
			}
			if (lapic_use_tscdeadline) {
				lapic_cputimer_intr.reload =
				    lapic_timer_tscdlt_reload;
			}
			cputimer_intr_register(&lapic_cputimer_intr);
			cputimer_intr_select(&lapic_cputimer_intr, 0);
		}
	} else if (!lapic_use_tscdeadline) {
		lapic_timer_set_divisor(lapic_timer_divisor_idx);
	}

	if (bootverbose)
		apic_dump("apic_initialize()");
}

static void
lapic_timer_set_divisor(int divisor_idx)
{
	KKASSERT(divisor_idx >= 0 && divisor_idx < APIC_TIMER_NDIVISORS);
	lapic->dcr_timer = lapic_timer_divisors[divisor_idx];
}

static void
lapic_timer_oneshot(u_int count)
{
	uint32_t value;

	value = lapic->lvt_timer;
	value &= ~(APIC_LVTT_PERIODIC | APIC_LVTT_TSCDLT);
	lapic->lvt_timer = value;
	lapic->icr_timer = count;
}

static void
lapic_timer_oneshot_quick(u_int count)
{
	lapic->icr_timer = count;
}

static void
lapic_timer_tscdeadline_quick(uint64_t diff)
{
	uint64_t val = rdtsc() + diff;

	wrmsr(MSR_TSC_DEADLINE, val);
	tsc_deadlines[mycpuid].timestamp = val;
}

static uint64_t
lapic_scale_to_tsc(unsigned value, unsigned scale)
{
	uint64_t val;

	val = value;
	val *= tsc_frequency;
	val += (scale - 1);
	val /= scale;
	return val;
}
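/*
 * Worked example (hypothetical numbers): with tsc_frequency = 2 GHz,
 * lapic_scale_to_tsc(150, 1000000) converts 150us into TSC ticks:
 * (150 * 2,000,000,000 + 999,999) / 1,000,000 = 300,000 ticks.
 */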

#define MAX_MEASURE_RETRIES	100

static u_int64_t
do_tsc_calibration(u_int us, u_int64_t apic_delay_tsc)
{
	u_int64_t old_tsc1, old_tsc2, new_tsc1, new_tsc2;
	u_int64_t diff, count;
	u_int64_t a;
	u_int32_t start, end;
	int retries1 = 0, retries2 = 0;

retry1:
	lapic_timer_oneshot_quick(APIC_TIMER_MAX_COUNT);
	old_tsc1 = rdtsc_ordered();
	start = lapic->ccr_timer;
	old_tsc2 = rdtsc_ordered();
	if (apic_delay_tsc > 0 && retries1 < MAX_MEASURE_RETRIES &&
	    old_tsc2 - old_tsc1 > 2 * apic_delay_tsc) {
		retries1++;
		goto retry1;
	}
	DELAY(us);
retry2:
	new_tsc1 = rdtsc_ordered();
	end = lapic->ccr_timer;
	new_tsc2 = rdtsc_ordered();
	if (apic_delay_tsc > 0 && retries2 < MAX_MEASURE_RETRIES &&
	    new_tsc2 - new_tsc1 > 2 * apic_delay_tsc) {
		retries2++;
		goto retry2;
	}
	if (end == 0)
		return 0;

	count = start - end;

	/* Make sure the lapic can count for up to 2s */
	a = (unsigned)APIC_TIMER_MAX_COUNT;
	if (us < 2000000 && (u_int64_t)count * 2000000 >= a * us)
		return 0;

	if (lapic_calibrate_test > 0 && (retries1 > 0 || retries2 > 0)) {
		kprintf("%s: retries1=%d retries2=%d\n",
		    __func__, retries1, retries2);
	}

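	/*
	 * diff below sums the TSC deltas of both measurement windows, i.e.
	 * roughly twice the elapsed time in TSC ticks, so the LAPIC timer
	 * frequency works out to count / (diff / (2 * tsc_frequency)),
	 * i.e. 2 * count * tsc_frequency / diff.
	 */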
	diff = (new_tsc1 - old_tsc1) + (new_tsc2 - old_tsc2);
	/* XXX First estimate if the total TSC diff value makes sense */
	/* This will almost overflow, but only almost :) */
	count = (2 * count * tsc_frequency) / diff;

	return count;
}

static uint64_t
do_cputimer_calibration(u_int us)
{
	sysclock_t value;
	sysclock_t start, end, beginning, finish;

	lapic_timer_oneshot(APIC_TIMER_MAX_COUNT);
	beginning = lapic->ccr_timer;
	start = sys_cputimer->count();
	DELAY(us);
	end = sys_cputimer->count();
	finish = lapic->ccr_timer;
	if (finish == 0)
		return 0;
	/* value is the LAPIC timer difference. */
	value = beginning - finish;
	/* end is the sys_cputimer difference. */
	end -= start;
	if (end == 0)
		return 0;
	value = ((uint64_t)value * sys_cputimer->freq) / end;
	return value;
}
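/*
 * E.g. (hypothetical numbers): if the LAPIC counted down 200,000 ticks
 * while sys_cputimer advanced 100,000 counts at freq = 1,000,000 Hz, the
 * measured LAPIC timer frequency is 200,000 * 1,000,000 / 100,000 = 2 MHz.
 */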

static void
lapic_timer_calibrate(void)
{
	sysclock_t value;
	u_int64_t apic_delay_tsc = 0;
	int use_tsc_calibration = 0;

	/* No need to calibrate the LAPIC timer if we will use TSC Deadline mode. */
	if (lapic_use_tscdeadline) {
		lapic_timer_tscfreq_shift = 0;
		while ((tsc_frequency >> lapic_timer_tscfreq_shift) > INT_MAX)
			lapic_timer_tscfreq_shift++;
		lapic_cputimer_intr.freq =
		    tsc_frequency >> lapic_timer_tscfreq_shift;
		kprintf(
		    "lapic: TSC Deadline Mode: shift %d, frequency %u Hz\n",
		    lapic_timer_tscfreq_shift, lapic_cputimer_intr.freq);
		return;
	}

	/*
	 * On real hardware, tsc_invariant == 0 wouldn't be an issue, but in
	 * a virtual machine the frequency may get changed by the host.
	 */
	if (tsc_frequency != 0 && tsc_invariant && lapic_calibrate_fast)
		use_tsc_calibration = 1;

	if (use_tsc_calibration) {
		u_int64_t min_apic_tsc = 0, max_apic_tsc = 0;
		u_int64_t old_tsc, new_tsc;
		sysclock_t val;
		int i;

		/* warm up */
		lapic_timer_oneshot(APIC_TIMER_MAX_COUNT);
		for (i = 0; i < 10; i++)
			val = lapic->ccr_timer;

		for (i = 0; i < 100; i++) {
			old_tsc = rdtsc_ordered();
			val = lapic->ccr_timer;
			new_tsc = rdtsc_ordered();
			new_tsc -= old_tsc;
			apic_delay_tsc += new_tsc;
			if (min_apic_tsc == 0 ||
			    min_apic_tsc > new_tsc) {
				min_apic_tsc = new_tsc;
			}
			if (max_apic_tsc < new_tsc)
				max_apic_tsc = new_tsc;
		}
		apic_delay_tsc /= 100;
		kprintf(
		    "LAPIC latency (in TSC ticks): %lu min: %lu max: %lu\n",
		    apic_delay_tsc, min_apic_tsc, max_apic_tsc);
		apic_delay_tsc = min_apic_tsc;
	}

	if (!use_tsc_calibration) {
		int i;

		/*
		 * Do some exercising of the lapic timer access. This improves
		 * precision of the subsequent calibration run in at least some
		 * virtualization cases.
		 */
		lapic_timer_set_divisor(0);
		for (i = 0; i < 10; i++)
			(void)do_cputimer_calibration(100);
	}
	/* Try to calibrate the local APIC timer. */
	for (lapic_timer_divisor_idx = 0;
	     lapic_timer_divisor_idx < APIC_TIMER_NDIVISORS;
	     lapic_timer_divisor_idx++) {
		lapic_timer_set_divisor(lapic_timer_divisor_idx);
		if (use_tsc_calibration) {
			value = do_tsc_calibration(200*1000, apic_delay_tsc);
		} else {
			value = do_cputimer_calibration(2*1000*1000);
		}
		if (value != 0)
			break;
	}
	if (lapic_timer_divisor_idx >= APIC_TIMER_NDIVISORS)
		panic("lapic: no proper timer divisor?!");
	lapic_cputimer_intr.freq = value;

	kprintf("lapic: divisor index %d, frequency %u Hz\n",
		lapic_timer_divisor_idx, lapic_cputimer_intr.freq);

	if (lapic_calibrate_test > 0) {
		uint64_t freq;
		int i;

		for (i = 1; i <= 20; i++) {
			if (use_tsc_calibration) {
				freq = do_tsc_calibration(i*100*1000,
				    apic_delay_tsc);
			} else {
				freq = do_cputimer_calibration(i*100*1000);
			}
			if (freq != 0)
				kprintf("%ums: %lu\n", i * 100, freq);
		}
	}
}

static void
lapic_timer_tscdlt_reload(struct cputimer_intr *cti, sysclock_t reload)
{
	struct globaldata *gd = mycpu;
	uint64_t diff, now, val;

	if (reload > 1000*1000*1000)
		reload = 1000*1000*1000;
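	/*
	 * Convert the reload request from sys_cputimer ticks to TSC ticks.
	 * E.g. (hypothetical numbers) reload = 500 with sys_cputimer->freq =
	 * 1,000,000 Hz is 500us, which at a 2 GHz TSC becomes
	 * 500 * 2,000,000,000 / 1,000,000 = 1,000,000 TSC ticks.
	 */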
	diff = (uint64_t)reload * tsc_frequency / sys_cputimer->freq;
	if (diff < 4)
		diff = 4;
	if (cpu_vendor_id == CPU_VENDOR_INTEL)
		cpu_lfence();
	else
		cpu_mfence();
	now = rdtsc();
	val = now + diff;
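	/*
	 * If a deadline is already armed, rearm only when it has passed or
	 * when the new deadline is earlier; programming a later deadline
	 * would delay the already-pending interrupt.
	 */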
	if (gd->gd_timer_running) {
		uint64_t deadline = tsc_deadlines[mycpuid].timestamp;
		if (deadline == 0 || now > deadline || val < deadline) {
			wrmsr(MSR_TSC_DEADLINE, val);
			tsc_deadlines[mycpuid].timestamp = val;
		}
	} else {
		gd->gd_timer_running = 1;
		wrmsr(MSR_TSC_DEADLINE, val);
		tsc_deadlines[mycpuid].timestamp = val;
	}
}

static void
lapic_timer_intr_reload(struct cputimer_intr *cti, sysclock_t reload)
{
	struct globaldata *gd = mycpu;

	reload = (int64_t)reload * cti->freq / sys_cputimer->freq;
	if (reload < 2)
		reload = 2;

	if (gd->gd_timer_running) {
		if (reload < lapic->ccr_timer)
			lapic_timer_oneshot_quick(reload);
	} else {
		gd->gd_timer_running = 1;
		lapic_timer_oneshot_quick(reload);
	}
}

static void
lapic_timer_intr_enable(struct cputimer_intr *cti __unused)
{
	uint32_t timer;

	timer = lapic->lvt_timer;
	timer &= ~(APIC_LVTT_MASKED | APIC_LVTT_PERIODIC | APIC_LVTT_TSCDLT);
	if (lapic_use_tscdeadline)
		timer |= APIC_LVTT_TSCDLT;
	lapic->lvt_timer = timer;
	if (lapic_use_tscdeadline)
		cpu_mfence();

	lapic_timer_fixup_handler(NULL);
}

static void
lapic_timer_fixup_handler(void *arg)
{
	int *started = arg;

	if (started != NULL)
		*started = 0;

	if (cpu_vendor_id == CPU_VENDOR_AMD) {
		/*
		 * Detect the presence of C1E capability mostly on latest
		 * dual-cores (or future) k8 family.  This feature renders
		 * the local APIC timer dead, so we disable it by reading
		 * the Interrupt Pending Message register and clearing both
		 * C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
		 *
		 * Reference:
		 *   "BIOS and Kernel Developer's Guide for AMD NPT
		 *    Family 0Fh Processors"
		 *   #32559 revision 3.00
		 */
		if ((cpu_id & 0x00000f00) == 0x00000f00 &&
		    (cpu_id & 0x0fff0000) >= 0x00040000) {
			uint64_t msr;

			msr = rdmsr(0xc0010055);
			if (msr & 0x18000000) {
				struct globaldata *gd = mycpu;

				kprintf("cpu%d: AMD C1E detected\n",
					gd->gd_cpuid);
				wrmsr(0xc0010055, msr & ~0x18000000ULL);

				/*
				 * We are kinda stalled;
				 * kick start again.
				 */
				gd->gd_timer_running = 1;
				if (lapic_use_tscdeadline) {
					/* Maybe reached in Virtual Machines? */
					lapic_timer_tscdeadline_quick(5000);
				} else {
					lapic_timer_oneshot_quick(2);
				}

				if (started != NULL)
					*started = 1;
			}
		}
	}
}

static void
lapic_timer_restart_handler(void *dummy __unused)
{
	int started;

	lapic_timer_fixup_handler(&started);
	if (!started) {
		struct globaldata *gd = mycpu;

		gd->gd_timer_running = 1;
		if (lapic_use_tscdeadline) {
			/* Maybe reached in Virtual Machines? */
			lapic_timer_tscdeadline_quick(5000);
		} else {
			lapic_timer_oneshot_quick(2);
		}
	}
}

/*
 * This function is currently called only by ACPICA code:
 * - AMD C1E fixup.  AMD C1E only seems to kick in after the ACPI
 *   module takes control of PM, so once ACPICA is attached we apply
 *   the fixup to prevent the LAPIC timer from hanging.
 */
static void
lapic_timer_intr_pmfixup(struct cputimer_intr *cti __unused)
{
	lwkt_send_ipiq_mask(smp_active_mask,
			    lapic_timer_fixup_handler, NULL);
}

static void
lapic_timer_intr_restart(struct cputimer_intr *cti __unused)
{
	lwkt_send_ipiq_mask(smp_active_mask, lapic_timer_restart_handler, NULL);
}


/*
 * dump contents of local APIC registers
 */
void
apic_dump(char* str)
{
	kprintf("SMP: CPU%d %s:\n", mycpu->gd_cpuid, str);
	kprintf("     lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n",
		lapic->lvt_lint0, lapic->lvt_lint1, lapic->tpr, lapic->svr);
}

/*
 * Inter Processor Interrupt functions.
 */

/*
 * Send APIC IPI 'vector' to 'dest_type' via 'delivery_mode'.
 *
 *  dest_type is one of: APIC_DEST_SELF, APIC_DEST_ALLISELF, APIC_DEST_ALLESELF
 *  vector is any valid SYSTEM INT vector
 *  delivery_mode is one of: APIC_DELMODE_FIXED, APIC_DELMODE_LOWPRIO
 *
 * WARNINGS!
 *
 * We now implement a per-cpu interlock (gd->gd_npoll) to prevent more than
 * one IPI from being sent to any given cpu at a time.  Thus we no longer
 * have to process incoming IPIs while waiting for the status to clear.
 * No deadlock should be possible.
 *
 * We now physically disable interrupts for the lapic ICR operation.  If
 * we do not do this then it looks like an EOI sent to the lapic (which
 * occurs even with a critical section) can interfere with the command
 * register ready status and cause an IPI to be lost.
 *
 * e.g. an interrupt can occur, issue the EOI, IRET, and cause the command
 * register to busy just before we write to icr_lo, resulting in a lost
 * issuance.  This only appears to occur on Intel cpus and is not
 * documented.  It could simply be that cpus are so fast these days that
 * it was always an issue, but is only now rearing its ugly head.  This
 * is conjecture.
 */
int
apic_ipi(int dest_type, int vector, int delivery_mode)
{
	uint32_t icr_hi;
	uint32_t icr_lo;
	int64_t tsc;
	int loops = 1;

	if ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
		tsc = rdtsc();
		while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
			cpu_pause();
			if ((tsc_sclock_t)(rdtsc() -
					   (tsc + tsc_frequency)) > 0) {
				kprintf("apic_ipi stall cpu %d (sing)\n",
					mycpuid);
				tsc = rdtsc();
				if (++loops > 30)
					panic("apic stall");
			}
		}
	}
	icr_hi = lapic->icr_hi & ~APIC_ID_MASK;
	icr_lo = (lapic->icr_lo & APIC_ICRLO_RESV_MASK) | dest_type |
		 APIC_LEVEL_ASSERT | delivery_mode | vector;
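	/*
	 * On the memory-mapped xAPIC the write to icr_lo is what actually
	 * issues the IPI, so icr_hi must be programmed first.
	 */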
	lapic->icr_hi = icr_hi;
	lapic->icr_lo = icr_lo;

	return 0;
}

/*
 * Interrupts must be hard-disabled by caller
 */
void
single_apic_ipi(int cpu, int vector, int delivery_mode)
{
	uint32_t  icr_lo;
	uint32_t  icr_hi;
	int64_t tsc;
	int loops = 1;

	if ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
		tsc = rdtsc();
		while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
			cpu_pause();
			if ((tsc_sclock_t)(rdtsc() -
					   (tsc + tsc_frequency)) > 0) {
				kprintf("single_apic_ipi stall cpu %d (sing)\n",
					mycpuid);
				tsc = rdtsc();
				if (++loops > 30)
					panic("apic stall");
			}
		}
	}
	icr_hi = lapic->icr_hi & ~APIC_ID_MASK;
	icr_hi |= (CPUID_TO_APICID(cpu) << 24);

	/* build ICR_LOW */
	icr_lo = (lapic->icr_lo & APIC_ICRLO_RESV_MASK) |
		 APIC_LEVEL_ASSERT | APIC_DEST_DESTFLD | delivery_mode | vector;

	/* write APIC ICR */
	lapic->icr_hi = icr_hi;
	lapic->icr_lo = icr_lo;
}

#if 0

/*
 * Returns 0 if the apic is busy, 1 if we were able to queue the request.
 *
 * NOT WORKING YET!  The code as-is may end up not queueing an IPI at all
 * to the target, and the scheduler does not 'poll' for IPI messages.
 */
int
single_apic_ipi_passive(int cpu, int vector, int delivery_mode)
{
	u_long  icr_lo;
	u_long  icr_hi;
	unsigned long rflags;

	rflags = read_rflags();
	cpu_disable_intr();
	if ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
		write_rflags(rflags);
		return(0);
	}
	icr_hi = lapic->icr_hi & ~APIC_ID_MASK;
	icr_hi |= (CPUID_TO_APICID(cpu) << 24);
	lapic->icr_hi = icr_hi;

	/* build ICR_LOW */
	icr_lo = (lapic->icr_lo & APIC_RESV2_MASK) |
		 APIC_DEST_DESTFLD | delivery_mode | vector;

	/* write APIC ICR */
	lapic->icr_lo = icr_lo;
	write_rflags(rflags);

	return(1);
}

#endif

/*
 * Send APIC IPI 'vector' to the cpus in 'target' via 'delivery_mode'.
 *
 * target is a bitmask of destination cpus.  Vector is any
 * valid system INT vector.  Delivery mode may be either
 * APIC_DELMODE_FIXED or APIC_DELMODE_LOWPRIO.
 *
 * Interrupts must be hard-disabled by caller
 */
void
selected_apic_ipi(cpumask_t target, int vector, int delivery_mode)
{
	while (CPUMASK_TESTNZERO(target)) {
		int n = BSFCPUMASK(target);
		CPUMASK_NANDBIT(target, n);
		single_apic_ipi(n, vector, delivery_mode);
	}
}

/*
 * Load a 'downcount time' in uSeconds.
 */
void
set_apic_timer(int us)
{
	u_int count;

	if (lapic_use_tscdeadline) {
		uint64_t val;

		val = lapic_scale_to_tsc(us, 1000000);
		val += rdtsc();
		/* No need to arm the lapic here, just track the timeout. */
		tsc_deadlines[mycpuid].downcount_time = val;
		return;
	}

	/*
	 * When we reach here, lapic timer's frequency
	 * must have been calculated as well as the
	 * divisor (lapic->dcr_timer is setup during the
	 * divisor calculation).
	 */
	KKASSERT(lapic_cputimer_intr.freq != 0 &&
		 lapic_timer_divisor_idx >= 0);

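	/*
	 * Round-up conversion from microseconds to LAPIC timer ticks.
	 * E.g. (hypothetical numbers) us = 150 at freq = 2,000,000 Hz:
	 * (150 * 2,000,000 + 999,999) / 1,000,000 = 300 ticks.
	 */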
	count = ((us * (int64_t)lapic_cputimer_intr.freq) + 999999) / 1000000;
	lapic_timer_oneshot(count);
}


/*
 * Read remaining time in timer, in microseconds (rounded up).
 */
int
read_apic_timer(void)
{
	uint64_t val;

	if (lapic_use_tscdeadline) {
		uint64_t now;

		val = tsc_deadlines[mycpuid].downcount_time;
		now = rdtsc();
		if (val == 0 || now > val) {
			return 0;
		} else {
			val -= now;
			val *= 1000000;
			val += (tsc_frequency - 1);
			val /= tsc_frequency;
			if (val > INT_MAX)
				val = INT_MAX;
			return val;
		}
	}

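	/*
	 * One-shot mode: convert the remaining count to microseconds,
	 * rounding up.  E.g. (hypothetical numbers) ccr = 101 at
	 * freq = 2,000,000 Hz: (101,000,000 + 1,999,999) / 2,000,000 = 51us
	 * (50.5us rounded up).
	 */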
	val = lapic->ccr_timer;
	if (val == 0)
		return 0;

	KKASSERT(lapic_cputimer_intr.freq > 0);
	val *= 1000000;
	val += (lapic_cputimer_intr.freq - 1);
	val /= lapic_cputimer_intr.freq;
	if (val > INT_MAX)
		val = INT_MAX;
	return val;
}


/*
 * Spin-style delay, set delay time in uS, spin till it drains.
 */
void
u_sleep(int count)
{
	set_apic_timer(count);
	while (read_apic_timer())
		 /* spin */ ;
}

int
lapic_unused_apic_id(int start)
{
	int i;

	for (i = start; i < APICID_MAX; ++i) {
		if (APICID_TO_CPUID(i) == -1)
			return i;
	}
	return NAPICID;
}

void
lapic_map(vm_paddr_t lapic_addr)
{
	lapic = pmap_mapdev_uncacheable(lapic_addr, sizeof(struct LAPIC));
}

static TAILQ_HEAD(, lapic_enumerator) lapic_enumerators =
	TAILQ_HEAD_INITIALIZER(lapic_enumerators);

int
lapic_config(void)
{
	struct lapic_enumerator *e;
	int error, i, ap_max;

	KKASSERT(lapic_enable);

	for (i = 0; i < NAPICID; ++i)
		APICID_TO_CPUID(i) = -1;

	TAILQ_FOREACH(e, &lapic_enumerators, lapic_link) {
		error = e->lapic_probe(e);
		if (!error)
			break;
	}
	if (e == NULL) {
		kprintf("LAPIC: Can't find LAPIC\n");
		return ENXIO;
	}

	error = e->lapic_enumerate(e);
	if (error) {
		kprintf("LAPIC: enumeration failed\n");
		return ENXIO;
	}

	ap_max = MAXCPU - 1;
	TUNABLE_INT_FETCH("hw.ap_max", &ap_max);
	if (ap_max > MAXCPU - 1)
		ap_max = MAXCPU - 1;

	if (naps > ap_max) {
		kprintf("LAPIC: Warning: only using %d of %d "
			"available APs\n",
			ap_max, naps);
		naps = ap_max;
	}

	return 0;
}

void
lapic_enumerator_register(struct lapic_enumerator *ne)
{
	struct lapic_enumerator *e;

	TAILQ_FOREACH(e, &lapic_enumerators, lapic_link) {
		if (e->lapic_prio < ne->lapic_prio) {
			TAILQ_INSERT_BEFORE(e, ne, lapic_link);
			return;
		}
	}
	TAILQ_INSERT_TAIL(&lapic_enumerators, ne, lapic_link);
}

void
lapic_set_cpuid(int cpu_id, int apic_id)
{
	CPUID_TO_APICID(cpu_id) = apic_id;
	APICID_TO_CPUID(apic_id) = cpu_id;
}

void
lapic_fixup_noioapic(void)
{
	u_int   temp;

	/* Only allowed on BSP */
	KKASSERT(mycpuid == 0);
	KKASSERT(!ioapic_enable);

	temp = lapic->lvt_lint0;
	temp &= ~APIC_LVT_MASKED;
	lapic->lvt_lint0 = temp;

	temp = lapic->lvt_lint1;
	temp |= APIC_LVT_MASKED;
	lapic->lvt_lint1 = temp;
}

static void
lapic_eoi_func(void)
{
	log_lapic(eoi);
	lapic->eoi = 0;
}

static void
lapic_sysinit(void *dummy __unused)
{
	if (lapic_enable) {
		int error;

		lapic_eoi = lapic_eoi_func;

		error = lapic_config();
		if (error)
			lapic_enable = 0;
	}

	if (lapic_enable) {
		/* Initialize BSP's local APIC */
		lapic_init(TRUE);
	} else if (ioapic_enable) {
		ioapic_enable = 0;
		icu_reinit_noioapic();
	}
}
SYSINIT(lapic, SI_BOOT2_LAPIC, SI_ORDER_FIRST, lapic_sysinit, NULL);