xref: /dragonfly/sys/platform/pc64/apic/lapic.c (revision 70344474)
1 /*
2  * Copyright (c) 1996, by Steve Passe
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. The name of the developer may NOT be used to endorse or promote products
11  *    derived from this software without specific prior written permission.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD: src/sys/i386/i386/mpapic.c,v 1.37.2.7 2003/01/25 02:31:47 peter Exp $
26  */
27 
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/kernel.h>
31 #include <sys/bus.h>
32 #include <sys/machintr.h>
33 #include <machine/globaldata.h>
34 #include <machine/clock.h>
35 #include <machine/limits.h>
36 #include <machine/smp.h>
37 #include <machine/md_var.h>
38 #include <machine/pmap.h>
39 #include <machine/specialreg.h>
40 #include <machine_base/apic/lapic.h>
41 #include <machine_base/apic/ioapic.h>
42 #include <machine_base/apic/ioapic_abi.h>
43 #include <machine_base/apic/apicvar.h>
44 #include <machine_base/icu/icu_var.h>
45 #include <machine/segments.h>
46 #include <sys/thread2.h>
47 #include <sys/spinlock2.h>
48 
49 #include <machine/cputypes.h>
50 #include <machine/intr_machdep.h>
51 
extern int naps;		/* number of APs; set by the enumerator */

/* Memory-mapped local APIC register window (established by lapic_map()). */
volatile lapic_t *lapic;

static void	lapic_timer_calibrate(void);
static void	lapic_timer_set_divisor(int);
static void	lapic_timer_fixup_handler(void *);
static void	lapic_timer_restart_handler(void *);


/* Tunable: use the LAPIC timer as the interrupt cputimer (default on). */
static int	lapic_timer_enable = 1;
TUNABLE_INT("hw.lapic_timer_enable", &lapic_timer_enable);

/* Tunable: prefer TSC-deadline timer mode when the cpu supports it. */
static int	lapic_timer_tscdeadline = 1;
TUNABLE_INT("hw.lapic_timer_tscdeadline", &lapic_timer_tscdeadline);

/* Tunable: run extra calibration passes and print the measured results. */
static int	lapic_calibrate_test = 0;
TUNABLE_INT("hw.lapic_calibrate_test", &lapic_calibrate_test);

/* Tunable: calibrate against the TSC (fast path) instead of sys_cputimer. */
static int	lapic_calibrate_fast = 1;
TUNABLE_INT("hw.lapic_calibrate_fast", &lapic_calibrate_fast);

static void	lapic_timer_tscdlt_reload(struct cputimer_intr *, sysclock_t);
static void	lapic_timer_intr_reload(struct cputimer_intr *, sysclock_t);
static void	lapic_timer_intr_enable(struct cputimer_intr *);
static void	lapic_timer_intr_restart(struct cputimer_intr *);
static void	lapic_timer_intr_pmfixup(struct cputimer_intr *);

/* Interrupt cputimer backed by the per-cpu LAPIC timer. */
static struct cputimer_intr lapic_cputimer_intr = {
	.freq = 0,
	.reload = lapic_timer_intr_reload,
	.enable = lapic_timer_intr_enable,
	.config = cputimer_intr_default_config,
	.restart = lapic_timer_intr_restart,
	.pmfixup = lapic_timer_intr_pmfixup,
	.initclock = cputimer_intr_default_initclock,
	.pcpuhand = NULL,
	.next = SLIST_ENTRY_INITIALIZER,
	.name = "lapic",
	.type = CPUTIMER_INTR_LAPIC,
	.prio = CPUTIMER_INTR_PRIO_LAPIC,
	.caps = CPUTIMER_INTR_CAP_NONE,
	.priv = NULL
};

/* Divisor table index chosen by lapic_timer_calibrate(); -1 until then. */
static int		lapic_timer_divisor_idx = -1;
static const uint32_t	lapic_timer_divisors[] = {
	APIC_TDCR_2,	APIC_TDCR_4,	APIC_TDCR_8,	APIC_TDCR_16,
	APIC_TDCR_32,	APIC_TDCR_64,	APIC_TDCR_128,	APIC_TDCR_1
};
#define APIC_TIMER_NDIVISORS (int)(NELEM(lapic_timer_divisors))

/* Non-zero when the TSC-deadline timer mode is in use (decided on BSP). */
static int	lapic_use_tscdeadline = 0;
/* The raw TSC frequency might not fit into a sysclock_t value. */
static int	lapic_timer_tscfreq_shift;

/*
 * APIC ID <-> CPU ID mapping structures.
 */
int	cpu_id_to_apic_id[NAPICID];
int	apic_id_to_cpu_id[NAPICID];
int	lapic_enable = 1;

/* Separate cachelines for each cpu's info. */
struct deadlines {
	uint64_t timestamp;	/* armed TSC-deadline value */
	uint64_t downcount_time;	/* set_apic_timer() expiry (TSC) */
	uint64_t padding[6];
};
struct deadlines *tsc_deadlines = NULL;
122 
123 /*
124  * Enable LAPIC, configure interrupts.
125  */
void
lapic_init(boolean_t bsp)
{
	uint32_t timer;
	u_int   temp;

	if (bsp) {
		/* Decide whether we want to use TSC Deadline mode. */
		if (lapic_timer_tscdeadline != 0 &&
		    (cpu_feature2 & CPUID2_TSCDLT) &&
		    tsc_invariant && tsc_frequency != 0) {
			lapic_use_tscdeadline = 1;
			/* One cacheline-aligned record per cpu (BSP + APs). */
			tsc_deadlines = kmalloc_cachealign(
			    sizeof(struct deadlines) * (naps + 1),
			    M_DEVBUF, M_WAITOK | M_ZERO);
		}
	}

	/*
	 * Install vectors
	 *
	 * Since IDT is shared between BSP and APs, these vectors
	 * only need to be installed once; we do it on BSP.
	 */
	if (bsp) {
		if (cpu_vendor_id == CPU_VENDOR_AMD &&
		    CPUID_TO_FAMILY(cpu_id) >= 0x0f &&
		    CPUID_TO_FAMILY(cpu_id) < 0x17) {	/* XXX */
			uint32_t tcr;

			/*
			 * Set the LINTEN bit in the HyperTransport
			 * Transaction Control Register.
			 *
			 * This will cause EXTINT and NMI interrupts
			 * routed over the hypertransport bus to be
			 * fed into the LAPIC LINT0/LINT1.  If the bit
			 * isn't set, the interrupts will go to the
			 * general cpu INTR/NMI pins.  On a dual-core
			 * cpu the interrupt winds up going to BOTH cpus.
			 * The first cpu that does the interrupt ack
			 * cycle will get the correct interrupt.  The
			 * second cpu that does it will get a spurious
			 * interrupt vector (typically IRQ 7).
			 */
			/* Select the TCR via PCI config mechanism #1. */
			outl(0x0cf8,
			    (1 << 31) |	/* enable */
			    (0 << 16) |	/* bus */
			    (0x18 << 11) | /* dev (cpu + 0x18) */
			    (0 << 8) |	/* func */
			    0x68	/* reg */
			    );
			tcr = inl(0xcfc);
			if ((tcr & 0x00010000) == 0) {
				kprintf("LAPIC: AMD LINTEN on\n");
				outl(0xcfc, tcr|0x00010000);
			}
			outl(0x0cf8, 0);
		}

		/* Install a 'Spurious INTerrupt' vector */
		setidt_global(XSPURIOUSINT_OFFSET, Xspuriousint,
		    SDT_SYSIGT, SEL_KPL, 0);

		/* Install a timer vector */
		setidt_global(XTIMER_OFFSET, Xtimer,
		    SDT_SYSIGT, SEL_KPL, 0);

		/* Install an inter-CPU IPI for TLB invalidation */
		setidt_global(XINVLTLB_OFFSET, Xinvltlb,
		    SDT_SYSIGT, SEL_KPL, 0);

		/* Install an inter-CPU IPI for IPIQ messaging */
		setidt_global(XIPIQ_OFFSET, Xipiq,
		    SDT_SYSIGT, SEL_KPL, 0);

		/* Install an inter-CPU IPI for CPU stop/restart */
		setidt_global(XCPUSTOP_OFFSET, Xcpustop,
		    SDT_SYSIGT, SEL_KPL, 0);

		/* Install an inter-CPU IPI for the sniffer */
		setidt_global(XSNIFF_OFFSET, Xsniff,
		    SDT_SYSIGT, SEL_KPL, 0);
	}

	/*
	 * Setup LINT0 as ExtINT on the BSP.  This is theoretically an
	 * aggregate interrupt input from the 8259.  The INTA cycle
	 * will be routed to the external controller (the 8259) which
	 * is expected to supply the vector.
	 *
	 * Must be setup edge triggered, active high.
	 *
	 * Disable LINT0 on BSP, if I/O APIC is enabled.
	 *
	 * Disable LINT0 on the APs.  It doesn't matter what delivery
	 * mode we use because we leave it masked.
	 */
	temp = lapic->lvt_lint0;
	temp &= ~(APIC_LVT_MASKED | APIC_LVT_TRIG_MASK |
		  APIC_LVT_POLARITY_MASK | APIC_LVT_DM_MASK);
	if (bsp) {
		temp |= APIC_LVT_DM_EXTINT;
		if (ioapic_enable)
			temp |= APIC_LVT_MASKED;
	} else {
		temp |= APIC_LVT_DM_FIXED | APIC_LVT_MASKED;
	}
	lapic->lvt_lint0 = temp;

	/*
	 * Setup LINT1 as NMI.
	 *
	 * Must be setup edge trigger, active high.
	 *
	 * Enable LINT1 on BSP, if I/O APIC is enabled.
	 *
	 * Disable LINT1 on the APs.
	 */
	temp = lapic->lvt_lint1;
	temp &= ~(APIC_LVT_MASKED | APIC_LVT_TRIG_MASK |
		  APIC_LVT_POLARITY_MASK | APIC_LVT_DM_MASK);
	temp |= APIC_LVT_MASKED | APIC_LVT_DM_NMI;
	if (bsp && ioapic_enable)
		temp &= ~APIC_LVT_MASKED;
	lapic->lvt_lint1 = temp;

	/*
	 * Mask the LAPIC error interrupt, LAPIC performance counter
	 * interrupt.
	 */
	lapic->lvt_error = lapic->lvt_error | APIC_LVT_MASKED;
	lapic->lvt_pcint = lapic->lvt_pcint | APIC_LVT_MASKED;

	/*
	 * Set LAPIC timer vector and mask the LAPIC timer interrupt.
	 */
	timer = lapic->lvt_timer;
	timer &= ~APIC_LVTT_VECTOR;
	timer |= XTIMER_OFFSET;
	timer |= APIC_LVTT_MASKED;
	lapic->lvt_timer = timer;

	/*
	 * Set the Task Priority Register as needed.   At the moment allow
	 * interrupts on all cpus (the APs will remain CLId until they are
	 * ready to deal).
	 */
	temp = lapic->tpr;
	temp &= ~APIC_TPR_PRIO;		/* clear priority field */
	lapic->tpr = temp;

	/*
	 * AMD specific setup
	 */
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    (lapic->version & APIC_VER_AMD_EXT_SPACE)) {
		uint32_t ext_feat;
		uint32_t count;
		uint32_t max_count;
		uint32_t lvt;
		uint32_t i;

		/*
		 * Mask all extended LVT entries (fixed delivery mode).
		 * The number of implemented entries is reported in the
		 * extended feature register; clamp to our array size.
		 */
		ext_feat = lapic->ext_feat;
		count = (ext_feat & APIC_EXTFEAT_MASK) >> APIC_EXTFEAT_SHIFT;
		max_count = sizeof(lapic->ext_lvt) / sizeof(lapic->ext_lvt[0]);
		if (count > max_count)
			count = max_count;
		for (i = 0; i < count; ++i) {
			lvt = lapic->ext_lvt[i].lvt;

			lvt &= ~(APIC_LVT_POLARITY_MASK | APIC_LVT_TRIG_MASK |
				 APIC_LVT_DM_MASK | APIC_LVT_MASKED);
			lvt |= APIC_LVT_MASKED | APIC_LVT_DM_FIXED;

			/* No entry currently gets special treatment. */
			switch(i) {
			case APIC_EXTLVT_IBS:
				break;
			case APIC_EXTLVT_MCA:
				break;
			case APIC_EXTLVT_DEI:
				break;
			case APIC_EXTLVT_SBI:
				break;
			default:
				break;
			}
			if (bsp) {
				kprintf("   LAPIC AMD elvt%d: 0x%08x",
					i, lapic->ext_lvt[i].lvt);
				if (lapic->ext_lvt[i].lvt != lvt)
					kprintf(" -> 0x%08x", lvt);
				kprintf("\n");
			}
			lapic->ext_lvt[i].lvt = lvt;
		}
	}

	/*
	 * Enable the LAPIC
	 */
	temp = lapic->svr;
	temp |= APIC_SVR_ENABLE;	/* enable the LAPIC */
	temp &= ~APIC_SVR_FOCUS_DISABLE; /* enable lopri focus processor */

	/*
	 * If EOI broadcast suppression is advertised, make sure it is
	 * turned off; we always want EOIs broadcast to the I/O APICs.
	 */
	if (lapic->version & APIC_VER_EOI_SUPP) {
		if (temp & APIC_SVR_EOI_SUPP) {
			temp &= ~APIC_SVR_EOI_SUPP;
			if (bsp)
				kprintf("    LAPIC disabling EOI supp\n");
		}
	}

	/*
	 * Set the spurious interrupt vector.  The low 4 bits of the vector
	 * must be 1111.
	 */
	if ((XSPURIOUSINT_OFFSET & 0x0F) != 0x0F)
		panic("bad XSPURIOUSINT_OFFSET: 0x%08x", XSPURIOUSINT_OFFSET);
	temp &= ~APIC_SVR_VECTOR;
	temp |= XSPURIOUSINT_OFFSET;

	lapic->svr = temp;

	/*
	 * Pump out a few EOIs to clean out interrupts that got through
	 * before we were able to set the TPR.
	 */
	lapic->eoi = 0;
	lapic->eoi = 0;
	lapic->eoi = 0;

	if (bsp) {
		/* Calibrate once on the BSP; APs reuse the result below. */
		lapic_timer_calibrate();
		if (lapic_timer_enable) {
			if (cpu_thermal_feature & CPUID_THERMAL_ARAT) {
				/*
				 * Local APIC timer will not stop
				 * in deep C-state.
				 */
				lapic_cputimer_intr.caps |=
				    CPUTIMER_INTR_CAP_PS;
			}
			if (lapic_use_tscdeadline) {
				lapic_cputimer_intr.reload =
				    lapic_timer_tscdlt_reload;
			}
			cputimer_intr_register(&lapic_cputimer_intr);
			cputimer_intr_select(&lapic_cputimer_intr, 0);
		}
	} else if (!lapic_use_tscdeadline) {
		/* Program the divisor the BSP's calibration settled on. */
		lapic_timer_set_divisor(lapic_timer_divisor_idx);
	}

	if (bootverbose)
		apic_dump("apic_initialize()");
}
383 
/*
 * Program the LAPIC timer divide configuration register from the
 * divisor table.  divisor_idx must be a valid table index.
 */
static void
lapic_timer_set_divisor(int divisor_idx)
{
	KKASSERT(divisor_idx >= 0 && divisor_idx < APIC_TIMER_NDIVISORS);
	lapic->dcr_timer = lapic_timer_divisors[divisor_idx];
}
390 
/*
 * Start a one-shot LAPIC timer countdown of 'count' ticks, forcing
 * the LVT timer entry out of periodic and TSC-deadline modes first.
 */
static void
lapic_timer_oneshot(u_int count)
{
	uint32_t value;

	value = lapic->lvt_timer;
	value &= ~(APIC_LVTT_PERIODIC | APIC_LVTT_TSCDLT);
	lapic->lvt_timer = value;
	lapic->icr_timer = count;
}
401 
/*
 * Re-arm the one-shot countdown without touching the LVT timer entry;
 * the caller guarantees the timer is already in one-shot mode.
 */
static void
lapic_timer_oneshot_quick(u_int count)
{
	lapic->icr_timer = count;
}
407 
/*
 * Arm the TSC-deadline timer to fire 'diff' TSC ticks from now and
 * record the armed deadline for this cpu.
 */
static void
lapic_timer_tscdeadline_quick(uint64_t diff)
{
	uint64_t val = rdtsc() + diff;

	wrmsr(MSR_TSC_DEADLINE, val);
	tsc_deadlines[mycpuid].timestamp = val;
}
416 
417 static uint64_t
418 lapic_scale_to_tsc(unsigned value, unsigned scale)
419 {
420 	uint64_t val;
421 
422 	val = value;
423 	val *= tsc_frequency;
424 	val += (scale - 1);
425 	val /= scale;
426 	return val;
427 }
428 
#define MAX_MEASURE_RETRIES	100

/*
 * Measure the LAPIC timer frequency using the TSC as the time
 * reference.  Returns the frequency in Hz, or 0 if the current timer
 * divisor is unusable (countdown wrapped, or the timer could not
 * cover a 2 second period).
 *
 * apic_delay_tsc is the typical TSC cost of one ccr_timer read; a
 * sample whose read took more than twice that (e.g. due to an SMI or
 * host preemption in a VM) is retried up to MAX_MEASURE_RETRIES times.
 */
static u_int64_t
do_tsc_calibration(u_int us, u_int64_t apic_delay_tsc)
{
	u_int64_t old_tsc1, old_tsc2, new_tsc1, new_tsc2;
	u_int64_t diff, count;
	u_int64_t a;
	u_int32_t start, end;
	int retries1 = 0, retries2 = 0;

retry1:
	lapic_timer_oneshot_quick(APIC_TIMER_MAX_COUNT);
	old_tsc1 = rdtsc_ordered();
	start = lapic->ccr_timer;
	old_tsc2 = rdtsc_ordered();
	if (apic_delay_tsc > 0 && retries1 < MAX_MEASURE_RETRIES &&
	    old_tsc2 - old_tsc1 > 2 * apic_delay_tsc) {
		retries1++;
		goto retry1;
	}
	DELAY(us);
retry2:
	new_tsc1 = rdtsc_ordered();
	end = lapic->ccr_timer;
	new_tsc2 = rdtsc_ordered();
	if (apic_delay_tsc > 0 && retries2 < MAX_MEASURE_RETRIES &&
	    new_tsc2 - new_tsc1 > 2 * apic_delay_tsc) {
		retries2++;
		goto retry2;
	}
	/* A zero countdown means the timer wrapped; divisor too small. */
	if (end == 0)
		return 0;

	count = start - end;

	/* Make sure the lapic can count for up to 2s */
	a = (unsigned)APIC_TIMER_MAX_COUNT;
	if (us < 2000000 && (u_int64_t)count * 2000000 >= a * us)
		return 0;

	if (lapic_calibrate_test > 0 && (retries1 > 0 || retries2 > 0)) {
		kprintf("%s: retries1=%d retries2=%d\n",
		    __func__, retries1, retries2);
	}

	/* Average the TSC windows taken before and after the delay. */
	diff = (new_tsc1 - old_tsc1) + (new_tsc2 - old_tsc2);
	/* XXX First estimate if the total TSC diff value makes sense */
	/* This will almost overflow, but only almost :) */
	count = (2 * count * tsc_frequency) / diff;

	return count;
}
482 
/*
 * Measure the LAPIC timer frequency using sys_cputimer as the time
 * reference (slow path, used when the TSC is unusable).  Returns the
 * frequency in Hz, or 0 if the measurement failed (countdown wrapped
 * or the reference timer did not advance).
 */
static uint64_t
do_cputimer_calibration(u_int us)
{
	sysclock_t value;
	sysclock_t start, end, beginning, finish;

	lapic_timer_oneshot(APIC_TIMER_MAX_COUNT);
	beginning = lapic->ccr_timer;
	start = sys_cputimer->count();
	DELAY(us);
	end = sys_cputimer->count();
	finish = lapic->ccr_timer;
	if (finish == 0)
		return 0;
	/* value is the LAPIC timer difference. */
	value = beginning - finish;
	/* end is the sys_cputimer difference. */
	end -= start;
	if (end == 0)
		return 0;
	/* Scale ticks-per-measurement to ticks-per-second. */
	value = ((uint64_t)value * sys_cputimer->freq) / end;
	return value;
}
506 
/*
 * Determine the LAPIC timer frequency (and divisor index) on the BSP.
 *
 * In TSC-deadline mode no LAPIC calibration is needed; the cputimer
 * frequency is simply the TSC frequency, right-shifted until it fits
 * a sysclock_t.  Otherwise walk the divisor table until a divisor
 * yields a valid measurement, using the TSC as the reference when it
 * is invariant (fast), else sys_cputimer (slow).
 */
static void
lapic_timer_calibrate(void)
{
	sysclock_t value;
	u_int64_t apic_delay_tsc = 0;
	int use_tsc_calibration = 0;

	/* No need to calibrate lapic_timer, if we will use TSC Deadline mode */
	if (lapic_use_tscdeadline) {
		lapic_timer_tscfreq_shift = 0;
		while ((tsc_frequency >> lapic_timer_tscfreq_shift) > INT_MAX)
			lapic_timer_tscfreq_shift++;
		lapic_cputimer_intr.freq =
		    tsc_frequency >> lapic_timer_tscfreq_shift;
		kprintf(
		    "lapic: TSC Deadline Mode: shift %d, frequency %u Hz\n",
		    lapic_timer_tscfreq_shift, lapic_cputimer_intr.freq);
		return;
	}

	/*
	 * On real hardware, tsc_invariant == 0 wouldn't be an issue, but in
	 * a virtual machine the frequency may get changed by the host.
	 */
	if (tsc_frequency != 0 && tsc_invariant && lapic_calibrate_fast)
		use_tsc_calibration = 1;

	if (use_tsc_calibration) {
		u_int64_t min_apic_tsc = 0, max_apic_tsc = 0;
		u_int64_t old_tsc, new_tsc;
		sysclock_t val;
		int i;

		/* warm up */
		lapic_timer_oneshot(APIC_TIMER_MAX_COUNT);
		for (i = 0; i < 10; i++)
			val = lapic->ccr_timer;

		/*
		 * Sample the TSC cost of a ccr_timer read 100 times;
		 * the minimum is later used as the outlier threshold
		 * in do_tsc_calibration().
		 */
		for (i = 0; i < 100; i++) {
			old_tsc = rdtsc_ordered();
			val = lapic->ccr_timer;
			new_tsc = rdtsc_ordered();
			new_tsc -= old_tsc;
			apic_delay_tsc += new_tsc;
			if (min_apic_tsc == 0 ||
			    min_apic_tsc > new_tsc) {
				min_apic_tsc = new_tsc;
			}
			if (max_apic_tsc < new_tsc)
				max_apic_tsc = new_tsc;
		}
		apic_delay_tsc /= 100;
		kprintf(
		    "LAPIC latency (in TSC ticks): %lu min: %lu max: %lu\n",
		    apic_delay_tsc, min_apic_tsc, max_apic_tsc);
		apic_delay_tsc = min_apic_tsc;
	}

	if (!use_tsc_calibration) {
		int i;

		/*
		 * Do some exercising of the lapic timer access. This improves
		 * precision of the subsequent calibration run in at least some
		 * virtualization cases.
		 */
		lapic_timer_set_divisor(0);
		for (i = 0; i < 10; i++)
			(void)do_cputimer_calibration(100);
	}
	/* Try to calibrate the local APIC timer. */
	for (lapic_timer_divisor_idx = 0;
	     lapic_timer_divisor_idx < APIC_TIMER_NDIVISORS;
	     lapic_timer_divisor_idx++) {
		lapic_timer_set_divisor(lapic_timer_divisor_idx);
		if (use_tsc_calibration) {
			value = do_tsc_calibration(200*1000, apic_delay_tsc);
		} else {
			value = do_cputimer_calibration(2*1000*1000);
		}
		if (value != 0)
			break;
	}
	if (lapic_timer_divisor_idx >= APIC_TIMER_NDIVISORS)
		panic("lapic: no proper timer divisor?!");
	lapic_cputimer_intr.freq = value;

	kprintf("lapic: divisor index %d, frequency %u Hz\n",
		lapic_timer_divisor_idx, lapic_cputimer_intr.freq);

	/* Optional sanity passes at increasing measurement windows. */
	if (lapic_calibrate_test > 0) {
		uint64_t freq;
		int i;

		for (i = 1; i <= 20; i++) {
			if (use_tsc_calibration) {
				freq = do_tsc_calibration(i*100*1000,
				    apic_delay_tsc);
			} else {
				freq = do_cputimer_calibration(i*100*1000);
			}
			if (freq != 0)
				kprintf("%ums: %lu\n", i * 100, freq);
		}
	}
}
613 
/*
 * cputimer_intr reload backend for TSC-deadline mode.  'reload' is in
 * sys_cputimer ticks; convert to TSC ticks and arm MSR_TSC_DEADLINE.
 * If a timer interrupt is already pending, only move the deadline
 * earlier (or re-arm a stale/empty one), never later.
 */
static void
lapic_timer_tscdlt_reload(struct cputimer_intr *cti, sysclock_t reload)
{
	struct globaldata *gd = mycpu;
	uint64_t diff, now, val;

	/* Clamp the requested delay to at most 1 second worth of ticks. */
	if (reload > 1000*1000*1000)
		reload = 1000*1000*1000;
	diff = (uint64_t)reload * tsc_frequency / sys_cputimer->freq;
	if (diff < 4)
		diff = 4;
	/* Fence so rdtsc() is not executed speculatively early. */
	if (cpu_vendor_id == CPU_VENDOR_INTEL)
		cpu_lfence();
	else
		cpu_mfence();
	now = rdtsc();
	val = now + diff;
	if (gd->gd_timer_running) {
		uint64_t deadline = tsc_deadlines[mycpuid].timestamp;
		if (deadline == 0 || now > deadline || val < deadline) {
			wrmsr(MSR_TSC_DEADLINE, val);
			tsc_deadlines[mycpuid].timestamp = val;
		}
	} else {
		gd->gd_timer_running = 1;
		wrmsr(MSR_TSC_DEADLINE, val);
		tsc_deadlines[mycpuid].timestamp = val;
	}
}
643 
/*
 * cputimer_intr reload backend for one-shot mode.  'reload' is in
 * sys_cputimer ticks; convert to LAPIC timer ticks.  If the timer is
 * already running, only shorten the pending countdown, never extend it.
 */
static void
lapic_timer_intr_reload(struct cputimer_intr *cti, sysclock_t reload)
{
	struct globaldata *gd = mycpu;

	reload = (int64_t)reload * cti->freq / sys_cputimer->freq;
	if (reload < 2)
		reload = 2;

	if (gd->gd_timer_running) {
		if (reload < lapic->ccr_timer)
			lapic_timer_oneshot_quick(reload);
	} else {
		gd->gd_timer_running = 1;
		lapic_timer_oneshot_quick(reload);
	}
}
661 
/*
 * Unmask the LAPIC timer interrupt, selecting TSC-deadline or one-shot
 * mode as configured, then run the AMD C1E fixup on this cpu.
 */
static void
lapic_timer_intr_enable(struct cputimer_intr *cti __unused)
{
	uint32_t timer;

	timer = lapic->lvt_timer;
	timer &= ~(APIC_LVTT_MASKED | APIC_LVTT_PERIODIC | APIC_LVTT_TSCDLT);
	if (lapic_use_tscdeadline)
		timer |= APIC_LVTT_TSCDLT;
	lapic->lvt_timer = timer;
	/* Order the LVT mode change before any MSR_TSC_DEADLINE writes. */
	if (lapic_use_tscdeadline)
		cpu_mfence();

	lapic_timer_fixup_handler(NULL);
}
677 
/*
 * Per-cpu fixup for cpu features that stop the LAPIC timer.  If 'arg'
 * is non-NULL it points to an int which is set to 1 when this handler
 * had to restart the timer itself, 0 otherwise.
 */
static void
lapic_timer_fixup_handler(void *arg)
{
	int *started = arg;

	if (started != NULL)
		*started = 0;

	if (cpu_vendor_id == CPU_VENDOR_AMD) {
		/*
		 * Detect the presence of C1E capability mostly on latest
		 * dual-cores (or future) k8 family.  This feature renders
		 * the local APIC timer dead, so we disable it by reading
		 * the Interrupt Pending Message register and clearing both
		 * C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
		 *
		 * Reference:
		 *   "BIOS and Kernel Developer's Guide for AMD NPT
		 *    Family 0Fh Processors"
		 *   #32559 revision 3.00
		 */
		if ((cpu_id & 0x00000f00) == 0x00000f00 &&
		    (cpu_id & 0x0fff0000) >= 0x00040000) {
			uint64_t msr;

			msr = rdmsr(0xc0010055);
			if (msr & 0x18000000) {
				struct globaldata *gd = mycpu;

				kprintf("cpu%d: AMD C1E detected\n",
					gd->gd_cpuid);
				wrmsr(0xc0010055, msr & ~0x18000000ULL);

				/*
				 * We are kinda stalled;
				 * kick start again.
				 */
				gd->gd_timer_running = 1;
				if (lapic_use_tscdeadline) {
					/* Maybe reached in Virtual Machines? */
					lapic_timer_tscdeadline_quick(5000);
				} else {
					lapic_timer_oneshot_quick(2);
				}

				if (started != NULL)
					*started = 1;
			}
		}
	}
}
729 
730 static void
731 lapic_timer_restart_handler(void *dummy __unused)
732 {
733 	int started;
734 
735 	lapic_timer_fixup_handler(&started);
736 	if (!started) {
737 		struct globaldata *gd = mycpu;
738 
739 		gd->gd_timer_running = 1;
740 		if (lapic_use_tscdeadline) {
741 			/* Maybe reached in Virtual Machines? */
742 			lapic_timer_tscdeadline_quick(5000);
743 		} else {
744 			lapic_timer_oneshot_quick(2);
745 		}
746 	}
747 }
748 
/*
 * This function is called only by ACPICA code currently:
 * - AMD C1E fixup.  AMD C1E only seems to happen after ACPI
 *   module controls PM.  So once ACPICA is attached, we try
 *   to apply the fixup to prevent LAPIC timer from hanging.
 */
static void
lapic_timer_intr_pmfixup(struct cputimer_intr *cti __unused)
{
	/* Run the fixup on every active cpu via IPI. */
	lwkt_send_ipiq_mask(smp_active_mask,
			    lapic_timer_fixup_handler, NULL);
}
761 
/*
 * Restart the LAPIC timer on every active cpu via IPI (cputimer_intr
 * restart backend).
 */
static void
lapic_timer_intr_restart(struct cputimer_intr *cti __unused)
{
	lwkt_send_ipiq_mask(smp_active_mask, lapic_timer_restart_handler, NULL);
}
767 
768 
769 /*
770  * dump contents of local APIC registers
771  */
void
apic_dump(char* str)
{
	/* 'str' tags the dump with its call site for the boot log. */
	kprintf("SMP: CPU%d %s:\n", mycpu->gd_cpuid, str);
	kprintf("     lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n",
		lapic->lvt_lint0, lapic->lvt_lint1, lapic->tpr, lapic->svr);
}
779 
780 /*
781  * Inter Processor Interrupt functions.
782  */
783 
784 /*
785  * Send APIC IPI 'vector' to 'destType' via 'deliveryMode'.
786  *
787  *  destType is 1 of: APIC_DEST_SELF, APIC_DEST_ALLISELF, APIC_DEST_ALLESELF
788  *  vector is any valid SYSTEM INT vector
789  *  delivery_mode is 1 of: APIC_DELMODE_FIXED, APIC_DELMODE_LOWPRIO
790  *
791  * WARNINGS!
792  *
793  * We now implement a per-cpu interlock (gd->gd_npoll) to prevent more than
794  * one IPI from being sent to any given cpu at a time.  Thus we no longer
795  * have to process incoming IPIs while waiting for the status to clear.
796  * No deadlock should be possible.
797  *
798  * We now physically disable interrupts for the lapic ICR operation.  If
799  * we do not do this then it looks like an EOI sent to the lapic (which
800  * occurs even with a critical section) can interfere with the command
801  * register ready status and cause an IPI to be lost.
802  *
803  * e.g. an interrupt can occur, issue the EOI, IRET, and cause the command
804  * register to busy just before we write to icr_lo, resulting in a lost
805  * issuance.  This only appears to occur on Intel cpus and is not
806  * documented.  It could simply be that cpus are so fast these days that
807  * it was always an issue, but is only now rearing its ugly head.  This
808  * is conjecture.
809  */
int
apic_ipi(int dest_type, int vector, int delivery_mode)
{
	uint32_t icr_hi;
	uint32_t icr_lo;
	int64_t tsc;
	int loops = 1;

	/*
	 * Wait for any previous IPI to be delivered.  Complain roughly
	 * once per second of TSC time and panic after ~30 such stalls.
	 */
	if ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
		tsc = rdtsc();
		while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
			cpu_pause();
			if ((tsc_sclock_t)(rdtsc() -
					   (tsc + tsc_frequency)) > 0) {
				kprintf("apic_ipi stall cpu %d (sing)\n",
					mycpuid);
				tsc = rdtsc();
				if (++loops > 30)
					panic("apic stall");
			}
		}
	}
	/* Compose and issue the command; icr_lo write triggers the IPI. */
	icr_hi = lapic->icr_hi & ~APIC_ID_MASK;
	icr_lo = (lapic->icr_lo & APIC_ICRLO_RESV_MASK) | dest_type |
		 APIC_LEVEL_ASSERT | delivery_mode | vector;
	lapic->icr_hi = icr_hi;
	lapic->icr_lo = icr_lo;

	return 0;
}
840 
841 /*
842  * Interrupts must be hard-disabled by caller
843  */
void
single_apic_ipi(int cpu, int vector, int delivery_mode)
{
	uint32_t  icr_lo;
	uint32_t  icr_hi;
	int64_t tsc;
	int loops = 1;

	/*
	 * Wait for any previous IPI to be delivered.  Complain roughly
	 * once per second of TSC time and panic after ~30 such stalls.
	 */
	if ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
		tsc = rdtsc();
		while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
			cpu_pause();
			if ((tsc_sclock_t)(rdtsc() -
					   (tsc + tsc_frequency)) > 0) {
				kprintf("single_apic_ipi stall cpu %d (sing)\n",
					mycpuid);
				tsc = rdtsc();
				if (++loops > 30)
					panic("apic stall");
			}
		}
	}
	/* Target the destination cpu's APIC ID in ICR_HI bits 31:24. */
	icr_hi = lapic->icr_hi & ~APIC_ID_MASK;
	icr_hi |= (CPUID_TO_APICID(cpu) << 24);

	/* build ICR_LOW */
	icr_lo = (lapic->icr_lo & APIC_ICRLO_RESV_MASK) |
		 APIC_LEVEL_ASSERT | APIC_DEST_DESTFLD | delivery_mode | vector;

	/* write APIC ICR */
	lapic->icr_hi = icr_hi;
	lapic->icr_lo = icr_lo;
}
877 
#if 0

/*
 * Returns 0 if the apic is busy, 1 if we were able to queue the request.
 *
 * NOT WORKING YET!  The code as-is may end up not queueing an IPI at all
 * to the target, and the scheduler does not 'poll' for IPI messages.
 */
int
single_apic_ipi_passive(int cpu, int vector, int delivery_mode)
{
	u_long  icr_lo;
	u_long  icr_hi;
	unsigned long rflags;

	rflags = read_rflags();
	cpu_disable_intr();
	if ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
		write_rflags(rflags);
		return(0);
	}
	icr_hi = lapic->icr_hi & ~APIC_ID_MASK;
	icr_hi |= (CPUID_TO_APICID(cpu) << 24);
	lapic->icr_hi = icr_hi;

	/* build ICR_LOW */
	/*
	 * NOTE(review): uses APIC_RESV2_MASK and omits APIC_LEVEL_ASSERT,
	 * unlike the live apic_ipi()/single_apic_ipi() paths which use
	 * APIC_ICRLO_RESV_MASK — reconcile before ever enabling this code.
	 */
	icr_lo = (lapic->icr_lo & APIC_RESV2_MASK) |
		 APIC_DEST_DESTFLD | delivery_mode | vector;

	/* write APIC ICR */
	lapic->icr_lo = icr_lo;
	write_rflags(rflags);

	return(1);
}

#endif
915 
916 /*
917  * Send APIC IPI 'vector' to 'target's via 'delivery_mode'.
918  *
919  * target is a bitmask of destination cpus.  Vector is any
920  * valid system INT vector.  Delivery mode may be either
921  * APIC_DELMODE_FIXED or APIC_DELMODE_LOWPRIO.
922  *
923  * Interrupts must be hard-disabled by caller
924  */
925 void
926 selected_apic_ipi(cpumask_t target, int vector, int delivery_mode)
927 {
928 	while (CPUMASK_TESTNZERO(target)) {
929 		int n = BSFCPUMASK(target);
930 		CPUMASK_NANDBIT(target, n);
931 		single_apic_ipi(n, vector, delivery_mode);
932 	}
933 }
934 
935 /*
936  * Load a 'downcount time' in uSeconds.
937  */
void
set_apic_timer(int us)
{
	u_int count;

	if (lapic_use_tscdeadline) {
		uint64_t val;

		val = lapic_scale_to_tsc(us, 1000000);
		val += rdtsc();
		/* No need to arm the lapic here, just track the timeout. */
		tsc_deadlines[mycpuid].downcount_time = val;
		return;
	}

	/*
	 * When we reach here, lapic timer's frequency
	 * must have been calculated as well as the
	 * divisor (lapic->dcr_timer is setup during the
	 * divisor calculation).
	 */
	KKASSERT(lapic_cputimer_intr.freq != 0 &&
		 lapic_timer_divisor_idx >= 0);

	/* Convert microseconds to timer ticks, rounding up. */
	count = ((us * (int64_t)lapic_cputimer_intr.freq) + 999999) / 1000000;
	lapic_timer_oneshot(count);
}
965 
966 
967 /*
968  * Read remaining time in timer, in microseconds (rounded up).
969  */
int
read_apic_timer(void)
{
	uint64_t val;

	if (lapic_use_tscdeadline) {
		uint64_t now;

		val = tsc_deadlines[mycpuid].downcount_time;
		now = rdtsc();
		if (val == 0 || now > val) {
			/* Never armed, or already expired. */
			return 0;
		} else {
			/* Convert remaining TSC ticks to us, rounding up. */
			val -= now;
			val *= 1000000;
			val += (tsc_frequency - 1);
			val /= tsc_frequency;
			if (val > INT_MAX)
				val = INT_MAX;
			return val;
		}
	}

	val = lapic->ccr_timer;
	if (val == 0)
		return 0;

	/* Convert remaining timer ticks to us, rounding up. */
	KKASSERT(lapic_cputimer_intr.freq > 0);
	val *= 1000000;
	val += (lapic_cputimer_intr.freq - 1);
	val /= lapic_cputimer_intr.freq;
	if (val > INT_MAX)
		val = INT_MAX;
	return val;
}
1005 
1006 
1007 /*
1008  * Spin-style delay, set delay time in uS, spin till it drains.
1009  */
/*
 * Spin-style delay: program a 'count' microsecond downcount and
 * busy-wait until it drains to zero.
 */
void
u_sleep(int count)
{
	set_apic_timer(count);
	while (read_apic_timer() != 0)
		;	/* spin */
}
1017 
1018 int
1019 lapic_unused_apic_id(int start)
1020 {
1021 	int i;
1022 
1023 	for (i = start; i < APICID_MAX; ++i) {
1024 		if (APICID_TO_CPUID(i) == -1)
1025 			return i;
1026 	}
1027 	return NAPICID;
1028 }
1029 
/*
 * Map the local APIC register block (physical address lapic_addr)
 * uncached into kernel virtual memory and point 'lapic' at it.
 */
void
lapic_map(vm_paddr_t lapic_addr)
{
	lapic = pmap_mapdev_uncacheable(lapic_addr, sizeof(struct LAPIC));
}
1035 
/* Registered LAPIC enumerators, kept sorted by descending priority. */
static TAILQ_HEAD(, lapic_enumerator) lapic_enumerators =
	TAILQ_HEAD_INITIALIZER(lapic_enumerators);

/*
 * Probe for a LAPIC via the registered enumerators and enumerate the
 * cpus, clamping the number of usable APs to hw.ap_max.  Returns 0 on
 * success or ENXIO if no LAPIC could be found/enumerated.
 */
int
lapic_config(void)
{
	struct lapic_enumerator *e;
	int error, i, ap_max;

	KKASSERT(lapic_enable);

	/* Reset the APIC ID -> CPU ID map. */
	for (i = 0; i < NAPICID; ++i)
		APICID_TO_CPUID(i) = -1;

	/* First enumerator (highest priority) whose probe succeeds wins. */
	TAILQ_FOREACH(e, &lapic_enumerators, lapic_link) {
		error = e->lapic_probe(e);
		if (!error)
			break;
	}
	if (e == NULL) {
		kprintf("LAPIC: Can't find LAPIC\n");
		return ENXIO;
	}

	error = e->lapic_enumerate(e);
	if (error) {
		kprintf("LAPIC: enumeration failed\n");
		return ENXIO;
	}

	ap_max = MAXCPU - 1;
	TUNABLE_INT_FETCH("hw.ap_max", &ap_max);
	if (ap_max > MAXCPU - 1)
		ap_max = MAXCPU - 1;

	if (naps > ap_max) {
		kprintf("LAPIC: Warning use only %d out of %d "
			"available APs\n",
			ap_max, naps);
		naps = ap_max;
	}

	return 0;
}
1080 
1081 void
1082 lapic_enumerator_register(struct lapic_enumerator *ne)
1083 {
1084 	struct lapic_enumerator *e;
1085 
1086 	TAILQ_FOREACH(e, &lapic_enumerators, lapic_link) {
1087 		if (e->lapic_prio < ne->lapic_prio) {
1088 			TAILQ_INSERT_BEFORE(e, ne, lapic_link);
1089 			return;
1090 		}
1091 	}
1092 	TAILQ_INSERT_TAIL(&lapic_enumerators, ne, lapic_link);
1093 }
1094 
/*
 * Record a bidirectional CPU ID <-> APIC ID mapping.
 */
void
lapic_set_cpuid(int cpu_id, int apic_id)
{
	CPUID_TO_APICID(cpu_id) = apic_id;
	APICID_TO_CPUID(apic_id) = cpu_id;
}
1101 
/*
 * Rewire the BSP's LINTs for operation without an I/O APIC:
 * unmask LINT0 (ExtINT from the 8259) and mask LINT1 (NMI).
 */
void
lapic_fixup_noioapic(void)
{
	u_int   temp;

	/* Only allowed on BSP */
	KKASSERT(mycpuid == 0);
	KKASSERT(!ioapic_enable);

	temp = lapic->lvt_lint0;
	temp &= ~APIC_LVT_MASKED;
	lapic->lvt_lint0 = temp;

	temp = lapic->lvt_lint1;
	temp |= APIC_LVT_MASKED;
	lapic->lvt_lint1 = temp;
}
1119 
1120 static void
1121 lapic_sysinit(void *dummy __unused)
1122 {
1123 	if (lapic_enable) {
1124 		int error;
1125 
1126 		error = lapic_config();
1127 		if (error)
1128 			lapic_enable = 0;
1129 	}
1130 
1131 	if (lapic_enable) {
1132 		/* Initialize BSP's local APIC */
1133 		lapic_init(TRUE);
1134 	} else if (ioapic_enable) {
1135 		ioapic_enable = 0;
1136 		icu_reinit_noioapic();
1137 	}
1138 }
1139 SYSINIT(lapic, SI_BOOT2_LAPIC, SI_ORDER_FIRST, lapic_sysinit, NULL);
1140