xref: /freebsd/sys/x86/x86/intr_machdep.c (revision 42249ef2)
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * Machine dependent interrupt code for x86.  For x86, we have to
 * deal with different PICs.  Thus, we use the passed-in vector to look up
 * an interrupt source associated with that vector.  The interrupt source
 * describes which PIC the source belongs to and includes methods to handle
 * that source.
 */

#include "opt_atpic.h"
#include "opt_ddb.h"
#include "opt_smp.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/ktr.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/vmmeter.h>
#include <machine/clock.h>
#include <machine/intr_machdep.h>
#include <machine/smp.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif

#ifndef DEV_ATPIC
#include <machine/segments.h>
#include <machine/frame.h>
#include <dev/ic/i8259.h>
#include <x86/isa/icu.h>
#include <isa/isareg.h>
#endif

#include <vm/vm.h>

#define	MAX_STRAY_LOG	5

typedef void (*mask_fn)(void *);

static int intrcnt_index;
static struct intsrc **interrupt_sources;
#ifdef SMP
static struct intsrc **interrupt_sorted;
static int intrbalance;
SYSCTL_INT(_hw, OID_AUTO, intrbalance, CTLFLAG_RW, &intrbalance, 0,
    "Interrupt auto-balance interval (seconds).  Zero disables.");
static struct timeout_task intrbalance_task;
#endif
static struct sx intrsrc_lock;
static struct mtx intrpic_lock;
static struct mtx intrcnt_lock;
static TAILQ_HEAD(pics_head, pic) pics;
u_int num_io_irqs;

#if defined(SMP) && !defined(EARLY_AP_STARTUP)
static int assign_cpu;
#endif

u_long *intrcnt;
char *intrnames;
size_t sintrcnt = sizeof(intrcnt);
size_t sintrnames = sizeof(intrnames);
int nintrcnt;

static MALLOC_DEFINE(M_INTR, "intr", "Interrupt Sources");

static int	intr_assign_cpu(void *arg, int cpu);
static void	intr_disable_src(void *arg);
static void	intr_init(void *__dummy);
static int	intr_pic_registered(struct pic *pic);
static void	intrcnt_setname(const char *name, int index);
static void	intrcnt_updatename(struct intsrc *is);
static void	intrcnt_register(struct intsrc *is);

/*
 * SYSINIT levels for SI_SUB_INTR:
 *
 * SI_ORDER_FIRST: Initialize locks and pics TAILQ, xen_hvm_cpu_init
 * SI_ORDER_SECOND: Xen PICs
 * SI_ORDER_THIRD: Add I/O APIC PICs, alloc MSI and Xen IRQ ranges
 * SI_ORDER_FOURTH: Add 8259A PICs
 * SI_ORDER_FOURTH + 1: Finalize interrupt count and add interrupt sources
 * SI_ORDER_MIDDLE: SMP interrupt counters
 * SI_ORDER_ANY: Enable interrupts on BSP
 */

static int
intr_pic_registered(struct pic *pic)
{
	struct pic *p;

	TAILQ_FOREACH(p, &pics, pics) {
		if (p == pic)
			return (1);
	}
	return (0);
}

/*
 * Register a new interrupt controller (PIC).  This is to support suspend
 * and resume where we suspend/resume controllers rather than individual
 * sources.  This also allows controllers with no active sources (such as
 * 8259As in a system using the APICs) to participate in suspend and resume.
 */
int
intr_register_pic(struct pic *pic)
{
	int error;

	mtx_lock(&intrpic_lock);
	if (intr_pic_registered(pic))
		error = EBUSY;
	else {
		TAILQ_INSERT_TAIL(&pics, pic, pics);
		error = 0;
	}
	mtx_unlock(&intrpic_lock);
	return (error);
}
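
/*
 * Illustrative sketch (not a real driver): a PIC driver typically
 * registers itself from a SYSINIT early in SI_SUB_INTR so that the
 * controller is known before sources are added at SI_ORDER_FOURTH + 1.
 * The names "mypic" and "mypic_register" are hypothetical:
 *
 *	static struct pic mypic = {
 *		... pic_* methods supplied by the driver ...
 *	};
 *
 *	static void
 *	mypic_register(void *dummy __unused)
 *	{
 *		if (intr_register_pic(&mypic) != 0)
 *			panic("mypic registered twice");
 *	}
 *	SYSINIT(mypic_reg, SI_SUB_INTR, SI_ORDER_THIRD, mypic_register,
 *	    NULL);
 */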

/*
 * Allocate interrupt source arrays and register interrupt sources
 * once the number of interrupts is known.
 */
static void
intr_init_sources(void *arg)
{
	struct pic *pic;

	MPASS(num_io_irqs > 0);

	interrupt_sources = mallocarray(num_io_irqs, sizeof(*interrupt_sources),
	    M_INTR, M_WAITOK | M_ZERO);
#ifdef SMP
	interrupt_sorted = mallocarray(num_io_irqs, sizeof(*interrupt_sorted),
	    M_INTR, M_WAITOK | M_ZERO);
#endif

	/*
	 * The intrcnt table is laid out as follows:
	 * - 1 dummy counter ("???") at index 0.
	 * - 2 counters for each I/O interrupt (the count and the stray count).
	 * - 1 counter per CPU for the lapic timer.
	 * - 1 counter per CPU for the Hyper-V vmbus driver.
	 * - 8 counters per CPU for IPI counters (SMP with COUNT_IPIS only).
	 */
	nintrcnt = 1 + num_io_irqs * 2 + mp_ncpus * 2;
#ifdef COUNT_IPIS
	if (mp_ncpus > 1)
		nintrcnt += 8 * mp_ncpus;
#endif
	intrcnt = mallocarray(nintrcnt, sizeof(u_long), M_INTR, M_WAITOK |
	    M_ZERO);
	intrnames = mallocarray(nintrcnt, MAXCOMLEN + 1, M_INTR, M_WAITOK |
	    M_ZERO);
	sintrcnt = nintrcnt * sizeof(u_long);
	sintrnames = nintrcnt * (MAXCOMLEN + 1);

	intrcnt_setname("???", 0);
	intrcnt_index = 1;

	/*
	 * NB: intrpic_lock is not held here to avoid LORs due to
	 * malloc() in intr_register_source().  However, we are still
	 * single-threaded at this point in startup so the list of
	 * PICs shouldn't change.
	 */
	TAILQ_FOREACH(pic, &pics, pics) {
		if (pic->pic_register_sources != NULL)
			pic->pic_register_sources(pic);
	}
}
SYSINIT(intr_init_sources, SI_SUB_INTR, SI_ORDER_FOURTH + 1, intr_init_sources,
    NULL);
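
/*
 * Worked example of the sizing above (illustrative numbers only): with
 * num_io_irqs = 24 and mp_ncpus = 4, nintrcnt is 1 + 24 * 2 + 4 * 2 = 57
 * counters, plus another 8 * 4 = 32 when COUNT_IPIS is configured.
 */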

/*
 * Register a new interrupt source with the global interrupt system.
 * Global interrupts must be disabled when this function is called.
 */
int
intr_register_source(struct intsrc *isrc)
{
	int error, vector;

	KASSERT(intr_pic_registered(isrc->is_pic), ("unregistered PIC"));
	vector = isrc->is_pic->pic_vector(isrc);
	KASSERT(vector < num_io_irqs, ("IRQ %d too large (%u irqs)", vector,
	    num_io_irqs));
	if (interrupt_sources[vector] != NULL)
		return (EEXIST);
	error = intr_event_create(&isrc->is_event, isrc, 0, vector,
	    intr_disable_src, (mask_fn)isrc->is_pic->pic_enable_source,
	    (mask_fn)isrc->is_pic->pic_eoi_source, intr_assign_cpu, "irq%d:",
	    vector);
	if (error)
		return (error);
	sx_xlock(&intrsrc_lock);
	if (interrupt_sources[vector] != NULL) {
		sx_xunlock(&intrsrc_lock);
		intr_event_destroy(isrc->is_event);
		return (EEXIST);
	}
	intrcnt_register(isrc);
	interrupt_sources[vector] = isrc;
	isrc->is_handlers = 0;
	sx_xunlock(&intrsrc_lock);
	return (0);
}

struct intsrc *
intr_lookup_source(int vector)
{

	if (vector < 0 || vector >= num_io_irqs)
		return (NULL);
	return (interrupt_sources[vector]);
}

int
intr_add_handler(const char *name, int vector, driver_filter_t filter,
    driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep,
    int domain)
{
	struct intsrc *isrc;
	int error;

	isrc = intr_lookup_source(vector);
	if (isrc == NULL)
		return (EINVAL);
	error = intr_event_add_handler(isrc->is_event, name, filter, handler,
	    arg, intr_priority(flags), flags, cookiep);
	if (error == 0) {
		sx_xlock(&intrsrc_lock);
		intrcnt_updatename(isrc);
		isrc->is_handlers++;
		if (isrc->is_handlers == 1) {
			isrc->is_domain = domain;
			isrc->is_pic->pic_enable_intr(isrc);
			isrc->is_pic->pic_enable_source(isrc);
		}
		sx_xunlock(&intrsrc_lock);
	}
	return (error);
}

int
intr_remove_handler(void *cookie)
{
	struct intsrc *isrc;
	int error;

	isrc = intr_handler_source(cookie);
	error = intr_event_remove_handler(cookie);
	if (error == 0) {
		sx_xlock(&intrsrc_lock);
		isrc->is_handlers--;
		if (isrc->is_handlers == 0) {
			isrc->is_pic->pic_disable_source(isrc, PIC_NO_EOI);
			isrc->is_pic->pic_disable_intr(isrc);
		}
		intrcnt_updatename(isrc);
		sx_xunlock(&intrsrc_lock);
	}
	return (error);
}
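
/*
 * Illustrative sketch: most drivers reach intr_add_handler() through
 * bus_setup_intr(9).  A direct call for a hypothetical device "mydev"
 * on IRQ 9 in domain 0 would look roughly like:
 *
 *	void *cookie;
 *
 *	error = intr_add_handler("mydev", 9, NULL, mydev_intr, sc,
 *	    INTR_TYPE_MISC | INTR_MPSAFE, &cookie, 0);
 *	...
 *	error = intr_remove_handler(cookie);
 */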

int
intr_config_intr(int vector, enum intr_trigger trig, enum intr_polarity pol)
{
	struct intsrc *isrc;

	isrc = intr_lookup_source(vector);
	if (isrc == NULL)
		return (EINVAL);
	return (isrc->is_pic->pic_config_intr(isrc, trig, pol));
}
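
/*
 * Illustrative use (hypothetical vector): switch an IRQ to
 * level-triggered, active-low operation, as ACPI does for, e.g.,
 * the SCI:
 *
 *	error = intr_config_intr(9, INTR_TRIGGER_LEVEL, INTR_POLARITY_LOW);
 */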

static void
intr_disable_src(void *arg)
{
	struct intsrc *isrc;

	isrc = arg;
	isrc->is_pic->pic_disable_source(isrc, PIC_EOI);
}

void
intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame)
{
	struct intr_event *ie;
	int vector;

	/*
	 * We count software interrupts when we process them.  The
	 * code here follows previous practice, but there's an
	 * argument for counting hardware interrupts when they're
	 * processed too.
	 */
	(*isrc->is_count)++;
	VM_CNT_INC(v_intr);

	ie = isrc->is_event;

	/*
	 * XXX: We assume that IRQ 0 is only used for the ISA timer
	 * device (clk).
	 */
	vector = isrc->is_pic->pic_vector(isrc);
	if (vector == 0)
		clkintr_pending = 1;

	/*
	 * For stray interrupts, mask and EOI the source, bump the
	 * stray count, and log the condition.
	 */
	if (intr_event_handle(ie, frame) != 0) {
		isrc->is_pic->pic_disable_source(isrc, PIC_EOI);
		(*isrc->is_straycount)++;
		if (*isrc->is_straycount < MAX_STRAY_LOG)
			log(LOG_ERR, "stray irq%d\n", vector);
		else if (*isrc->is_straycount == MAX_STRAY_LOG)
			log(LOG_CRIT,
			    "too many stray irq %d's: not logging anymore\n",
			    vector);
	}
}

void
intr_resume(bool suspend_cancelled)
{
	struct pic *pic;

#ifndef DEV_ATPIC
	atpic_reset();
#endif
	mtx_lock(&intrpic_lock);
	TAILQ_FOREACH(pic, &pics, pics) {
		if (pic->pic_resume != NULL)
			pic->pic_resume(pic, suspend_cancelled);
	}
	mtx_unlock(&intrpic_lock);
}

void
intr_suspend(void)
{
	struct pic *pic;

	mtx_lock(&intrpic_lock);
	TAILQ_FOREACH_REVERSE(pic, &pics, pics_head, pics) {
		if (pic->pic_suspend != NULL)
			pic->pic_suspend(pic);
	}
	mtx_unlock(&intrpic_lock);
}

static int
intr_assign_cpu(void *arg, int cpu)
{
#ifdef SMP
	struct intsrc *isrc;
	int error;

#ifdef EARLY_AP_STARTUP
	MPASS(mp_ncpus == 1 || smp_started);

	/* Nothing to do if there is only a single CPU. */
	if (mp_ncpus > 1 && cpu != NOCPU) {
#else
	/*
	 * Don't do anything during early boot.  We will pick up the
	 * assignment once the APs are started.
	 */
	if (assign_cpu && cpu != NOCPU) {
#endif
		isrc = arg;
		sx_xlock(&intrsrc_lock);
		error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]);
		if (error == 0)
			isrc->is_cpu = cpu;
		sx_xunlock(&intrsrc_lock);
	} else
		error = 0;
	return (error);
#else
	return (EOPNOTSUPP);
#endif
}

static void
intrcnt_setname(const char *name, int index)
{

	snprintf(intrnames + (MAXCOMLEN + 1) * index, MAXCOMLEN + 1, "%-*s",
	    MAXCOMLEN, name);
}

static void
intrcnt_updatename(struct intsrc *is)
{

	intrcnt_setname(is->is_event->ie_fullname, is->is_index);
}

static void
intrcnt_register(struct intsrc *is)
{
	char straystr[MAXCOMLEN + 1];

	KASSERT(is->is_event != NULL, ("%s: isrc with no event", __func__));
	mtx_lock_spin(&intrcnt_lock);
	MPASS(intrcnt_index + 2 <= nintrcnt);
	is->is_index = intrcnt_index;
	intrcnt_index += 2;
	snprintf(straystr, MAXCOMLEN + 1, "stray irq%d",
	    is->is_pic->pic_vector(is));
	intrcnt_updatename(is);
	is->is_count = &intrcnt[is->is_index];
	intrcnt_setname(straystr, is->is_index + 1);
	is->is_straycount = &intrcnt[is->is_index + 1];
	mtx_unlock_spin(&intrcnt_lock);
}

void
intrcnt_add(const char *name, u_long **countp)
{

	mtx_lock_spin(&intrcnt_lock);
	MPASS(intrcnt_index < nintrcnt);
	*countp = &intrcnt[intrcnt_index];
	intrcnt_setname(name, intrcnt_index);
	intrcnt_index++;
	mtx_unlock_spin(&intrcnt_lock);
}
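
/*
 * Illustrative sketch: this is the hook the lapic timer and IPI code
 * use to publish extra per-CPU event counters through vmstat -i.  The
 * name "mydev_count" is hypothetical:
 *
 *	static u_long *mydev_count;
 *
 *	intrcnt_add("mydev:event", &mydev_count);
 *
 * ... and then in the event-handling path:
 *
 *	(*mydev_count)++;
 */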

static void
intr_init(void *dummy __unused)
{

	TAILQ_INIT(&pics);
	mtx_init(&intrpic_lock, "intrpic", NULL, MTX_DEF);
	sx_init(&intrsrc_lock, "intrsrc");
	mtx_init(&intrcnt_lock, "intrcnt", NULL, MTX_SPIN);
}
SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL);

static void
intr_init_final(void *dummy __unused)
{

	/*
	 * Enable interrupts on the BSP after all of the interrupt
	 * controllers are initialized.  Device interrupts are still
	 * disabled in the interrupt controllers until interrupt
	 * handlers are registered.  Interrupts are enabled on each AP
	 * after their first context switch.
	 */
	enable_intr();
}
SYSINIT(intr_init_final, SI_SUB_INTR, SI_ORDER_ANY, intr_init_final, NULL);

#ifndef DEV_ATPIC
/* Initialize the two 8259A's to a known-good shutdown state. */
void
atpic_reset(void)
{

	outb(IO_ICU1, ICW1_RESET | ICW1_IC4);
	outb(IO_ICU1 + ICU_IMR_OFFSET, IDT_IO_INTS);
	outb(IO_ICU1 + ICU_IMR_OFFSET, IRQ_MASK(ICU_SLAVEID));
	outb(IO_ICU1 + ICU_IMR_OFFSET, MASTER_MODE);
	outb(IO_ICU1 + ICU_IMR_OFFSET, 0xff);
	outb(IO_ICU1, OCW3_SEL | OCW3_RR);

	outb(IO_ICU2, ICW1_RESET | ICW1_IC4);
	outb(IO_ICU2 + ICU_IMR_OFFSET, IDT_IO_INTS + 8);
	outb(IO_ICU2 + ICU_IMR_OFFSET, ICU_SLAVEID);
	outb(IO_ICU2 + ICU_IMR_OFFSET, SLAVE_MODE);
	outb(IO_ICU2 + ICU_IMR_OFFSET, 0xff);
	outb(IO_ICU2, OCW3_SEL | OCW3_RR);
}
#endif

/* Add a description to an active interrupt handler. */
int
intr_describe(u_int vector, void *ih, const char *descr)
{
	struct intsrc *isrc;
	int error;

	isrc = intr_lookup_source(vector);
	if (isrc == NULL)
		return (EINVAL);
	error = intr_event_describe_handler(isrc->is_event, ih, descr);
	if (error)
		return (error);
	intrcnt_updatename(isrc);
	return (0);
}
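
/*
 * Illustrative use: a multiqueue driver can tag each handler so the
 * names reported by vmstat -i and hw.intrs identify the queues
 * ("rxq0" is a hypothetical description):
 *
 *	error = intr_describe(vector, cookie, "rxq0");
 */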

void
intr_reprogram(void)
{
	struct intsrc *is;
	u_int v;

	sx_xlock(&intrsrc_lock);
	for (v = 0; v < num_io_irqs; v++) {
		is = interrupt_sources[v];
		if (is == NULL)
			continue;
		if (is->is_pic->pic_reprogram_pin != NULL)
			is->is_pic->pic_reprogram_pin(is);
	}
	sx_xunlock(&intrsrc_lock);
}

#ifdef DDB
/*
 * Dump data about interrupt handlers
 */
DB_SHOW_COMMAND(irqs, db_show_irqs)
{
	struct intsrc **isrc;
	u_int i;
	int verbose;

	if (strcmp(modif, "v") == 0)
		verbose = 1;
	else
		verbose = 0;
	isrc = interrupt_sources;
	for (i = 0; i < num_io_irqs && !db_pager_quit; i++, isrc++)
		if (*isrc != NULL)
			db_dump_intr_event((*isrc)->is_event, verbose);
}
#endif

#ifdef SMP
/*
 * Support for balancing interrupt sources across CPUs.  For now we just
 * allocate CPUs round-robin.
 */

cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1);
static int current_cpu[MAXMEMDOM];

static void
intr_init_cpus(void)
{
	int i;

	for (i = 0; i < vm_ndomains; i++) {
		current_cpu[i] = 0;
		if (!CPU_ISSET(current_cpu[i], &intr_cpus) ||
		    !CPU_ISSET(current_cpu[i], &cpuset_domain[i]))
			intr_next_cpu(i);
	}
}

/*
 * Return the CPU that the next interrupt source should use.  For now
 * this just returns the next local APIC according to round-robin.
 */
u_int
intr_next_cpu(int domain)
{
	u_int apic_id;

#ifdef EARLY_AP_STARTUP
	MPASS(mp_ncpus == 1 || smp_started);
	if (mp_ncpus == 1)
		return (PCPU_GET(apic_id));
#else
	/* Leave all interrupts on the BSP during boot. */
	if (!assign_cpu)
		return (PCPU_GET(apic_id));
#endif

	mtx_lock_spin(&icu_lock);
	apic_id = cpu_apic_ids[current_cpu[domain]];
	do {
		current_cpu[domain]++;
		if (current_cpu[domain] > mp_maxid)
			current_cpu[domain] = 0;
	} while (!CPU_ISSET(current_cpu[domain], &intr_cpus) ||
	    !CPU_ISSET(current_cpu[domain], &cpuset_domain[domain]));
	mtx_unlock_spin(&icu_lock);
	return (apic_id);
}

/* Attempt to bind the specified IRQ to the specified CPU. */
int
intr_bind(u_int vector, u_char cpu)
{
	struct intsrc *isrc;

	isrc = intr_lookup_source(vector);
	if (isrc == NULL)
		return (EINVAL);
	return (intr_event_bind(isrc->is_event, cpu));
}
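
/*
 * Illustrative use (hypothetical values): pin IRQ 9's interrupt thread
 * and source to CPU 2, e.g. as a simple affinity policy in a driver:
 *
 *	error = intr_bind(9, 2);
 */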

/*
 * Add a CPU to our mask of valid CPUs that can be destinations of
 * interrupts.
 */
void
intr_add_cpu(u_int cpu)
{

	if (cpu >= MAXCPU)
		panic("%s: Invalid CPU ID", __func__);
	if (bootverbose)
		printf("INTR: Adding local APIC %d as a target\n",
		    cpu_apic_ids[cpu]);

	CPU_SET(cpu, &intr_cpus);
}

#ifdef EARLY_AP_STARTUP
static void
intr_smp_startup(void *arg __unused)
{

	intr_init_cpus();
}
SYSINIT(intr_smp_startup, SI_SUB_SMP, SI_ORDER_SECOND, intr_smp_startup,
    NULL);

#else
/*
 * Distribute all the interrupt sources among the available CPUs once the
 * APs have been launched.
 */
static void
intr_shuffle_irqs(void *arg __unused)
{
	struct intsrc *isrc;
	u_int cpu, i;

	intr_init_cpus();
	/* Don't bother on UP. */
	if (mp_ncpus == 1)
		return;

	/* Round-robin assign a CPU to each enabled source. */
	sx_xlock(&intrsrc_lock);
	assign_cpu = 1;
	for (i = 0; i < num_io_irqs; i++) {
		isrc = interrupt_sources[i];
		if (isrc != NULL && isrc->is_handlers > 0) {
			/*
			 * If this event is already bound to a CPU,
			 * then assign the source to that CPU instead
			 * of picking one via round-robin.  Note that
			 * this is careful to only advance the
			 * round-robin if the CPU assignment succeeds.
			 */
			cpu = isrc->is_event->ie_cpu;
			if (cpu == NOCPU)
				cpu = current_cpu[isrc->is_domain];
			if (isrc->is_pic->pic_assign_cpu(isrc,
			    cpu_apic_ids[cpu]) == 0) {
				isrc->is_cpu = cpu;
				if (isrc->is_event->ie_cpu == NOCPU)
					intr_next_cpu(isrc->is_domain);
			}
		}
	}
	sx_xunlock(&intrsrc_lock);
}
SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs,
    NULL);
#endif

/*
 * TODO: Export this information in a non-MD fashion, integrate with vmstat -i.
 */
static int
sysctl_hw_intrs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct intsrc *isrc;
	u_int i;
	int error;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);

	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	sx_slock(&intrsrc_lock);
	for (i = 0; i < num_io_irqs; i++) {
		isrc = interrupt_sources[i];
		if (isrc == NULL)
			continue;
		sbuf_printf(&sbuf, "%s:%d @cpu%d(domain%d): %ld\n",
		    isrc->is_event->ie_fullname,
		    isrc->is_index,
		    isrc->is_cpu,
		    isrc->is_domain,
		    *isrc->is_count);
	}

	sx_sunlock(&intrsrc_lock);
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
SYSCTL_PROC(_hw, OID_AUTO, intrs, CTLTYPE_STRING | CTLFLAG_RW,
    0, 0, sysctl_hw_intrs, "A", "interrupt:number @cpu: count");
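
/*
 * Illustrative output (values made up), one line per allocated source,
 * following the sbuf_printf() format above:
 *
 *	# sysctl hw.intrs
 *	hw.intrs: irq1: atkbd0:3 @cpu0(domain0): 522
 *	irq9: acpi0:5 @cpu1(domain0): 42
 */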

/*
 * Compare two, possibly NULL, entries in the interrupt source array
 * by load.
 */
static int
intrcmp(const void *one, const void *two)
{
	const struct intsrc *i1, *i2;

	i1 = *(const struct intsrc * const *)one;
	i2 = *(const struct intsrc * const *)two;
	if (i1 != NULL && i2 != NULL)
		return (*i1->is_count - *i2->is_count);
	if (i1 != NULL)
		return (1);
	if (i2 != NULL)
		return (-1);
	return (0);
}

/*
 * Balance IRQs across available CPUs according to load.
 */
static void
intr_balance(void *dummy __unused, int pending __unused)
{
	struct intsrc *isrc;
	int interval;
	u_int cpu;
	int i;

	interval = intrbalance;
	if (interval == 0)
		goto out;

	/*
	 * Sort interrupts according to count.
	 */
	sx_xlock(&intrsrc_lock);
	memcpy(interrupt_sorted, interrupt_sources, num_io_irqs *
	    sizeof(interrupt_sorted[0]));
	qsort(interrupt_sorted, num_io_irqs, sizeof(interrupt_sorted[0]),
	    intrcmp);

	/*
	 * Restart the scan from the same location to avoid moving in the
	 * common case.
	 */
	intr_init_cpus();

	/*
	 * Assign round-robin from most loaded to least.
	 */
	for (i = num_io_irqs - 1; i >= 0; i--) {
		isrc = interrupt_sorted[i];
		if (isrc == NULL || isrc->is_event->ie_cpu != NOCPU)
			continue;
		cpu = current_cpu[isrc->is_domain];
		intr_next_cpu(isrc->is_domain);
		if (isrc->is_cpu != cpu &&
		    isrc->is_pic->pic_assign_cpu(isrc,
		    cpu_apic_ids[cpu]) == 0)
			isrc->is_cpu = cpu;
	}
	sx_xunlock(&intrsrc_lock);
out:
	taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task,
	    interval ? hz * interval : hz * 60);
}

static void
intr_balance_init(void *dummy __unused)
{

	TIMEOUT_TASK_INIT(taskqueue_thread, &intrbalance_task, 0, intr_balance,
	    NULL);
	taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task, hz);
}
SYSINIT(intr_balance_init, SI_SUB_SMP, SI_ORDER_ANY, intr_balance_init, NULL);

#else
/*
 * Always route interrupts to the current processor in the UP case.
 */
u_int
intr_next_cpu(int domain)
{

	return (PCPU_GET(apic_id));
}
#endif