xref: /freebsd/sys/x86/x86/intr_machdep.c (revision 4b9d6057)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 /*
29  * Machine dependent interrupt code for x86.  For x86, we have to
30  * deal with different PICs.  Thus, we use the passed in vector to lookup
31  * an interrupt source associated with that vector.  The interrupt source
32  * describes which PIC the source belongs to and includes methods to handle
33  * that source.
34  */
35 
36 #include "opt_atpic.h"
37 #include "opt_ddb.h"
38 #include "opt_smp.h"
39 
40 #include <sys/param.h>
41 #include <sys/bus.h>
42 #include <sys/interrupt.h>
43 #include <sys/ktr.h>
44 #include <sys/kernel.h>
45 #include <sys/lock.h>
46 #include <sys/malloc.h>
47 #include <sys/mutex.h>
48 #include <sys/proc.h>
49 #include <sys/queue.h>
50 #include <sys/sbuf.h>
51 #include <sys/smp.h>
52 #include <sys/sx.h>
53 #include <sys/sysctl.h>
54 #include <sys/syslog.h>
55 #include <sys/systm.h>
56 #include <sys/taskqueue.h>
57 #include <sys/vmmeter.h>
58 #include <machine/clock.h>
59 #include <machine/intr_machdep.h>
60 #include <machine/smp.h>
61 #ifdef DDB
62 #include <ddb/ddb.h>
63 #endif
64 
65 #ifndef DEV_ATPIC
66 #include <machine/segments.h>
67 #include <machine/frame.h>
68 #include <dev/ic/i8259.h>
69 #include <x86/isa/icu.h>
70 #include <isa/isareg.h>
71 #endif
72 
73 #include <vm/vm.h>
74 
75 typedef void (*mask_fn)(void *);
76 
77 static int intrcnt_index;
78 static struct intsrc **interrupt_sources;
79 #ifdef SMP
80 static struct intsrc **interrupt_sorted;
81 static int intrbalance;
82 SYSCTL_INT(_hw, OID_AUTO, intrbalance, CTLFLAG_RWTUN, &intrbalance, 0,
83     "Interrupt auto-balance interval (seconds).  Zero disables.");
84 static struct timeout_task intrbalance_task;
85 #endif
86 static struct sx intrsrc_lock;
87 static struct mtx intrpic_lock;
88 static struct mtx intrcnt_lock;
89 static TAILQ_HEAD(pics_head, pic) pics;
90 u_int num_io_irqs;
91 
92 #if defined(SMP) && !defined(EARLY_AP_STARTUP)
93 #error EARLY_AP_STARTUP required on x86
94 #endif
95 
96 #define	INTRNAME_LEN	(MAXCOMLEN + 1)
97 u_long *intrcnt;
98 char *intrnames;
99 size_t sintrcnt = sizeof(intrcnt);
100 size_t sintrnames = sizeof(intrnames);
101 int nintrcnt;
102 
103 static MALLOC_DEFINE(M_INTR, "intr", "Interrupt Sources");
104 
105 static int	intr_assign_cpu(void *arg, int cpu);
106 static void	intr_disable_src(void *arg);
107 static void	intr_init(void *__dummy);
108 static int	intr_pic_registered(struct pic *pic);
109 static void	intrcnt_setname(const char *name, int index);
110 static void	intrcnt_updatename(struct intsrc *is);
111 static void	intrcnt_register(struct intsrc *is);
112 
113 /*
114  * SYSINIT levels for SI_SUB_INTR:
115  *
116  * SI_ORDER_FIRST: Initialize locks and pics TAILQ, xen_hvm_cpu_init
117  * SI_ORDER_SECOND: Xen PICs
118  * SI_ORDER_THIRD: Add I/O APIC PICs, alloc MSI and Xen IRQ ranges
119  * SI_ORDER_FOURTH: Add 8259A PICs
120  * SI_ORDER_FOURTH + 1: Finalize interrupt count and add interrupt sources
121  * SI_ORDER_MIDDLE: SMP interrupt counters
122  * SI_ORDER_ANY: Enable interrupts on BSP
123  */
124 
125 static int
126 intr_pic_registered(struct pic *pic)
127 {
128 	struct pic *p;
129 
130 	TAILQ_FOREACH(p, &pics, pics) {
131 		if (p == pic)
132 			return (1);
133 	}
134 	return (0);
135 }
136 
137 /*
138  * Register a new interrupt controller (PIC).  This is to support suspend
139  * and resume where we suspend/resume controllers rather than individual
140  * sources.  This also allows controllers with no active sources (such as
141  * 8259As in a system using the APICs) to participate in suspend and resume.
142  */
143 int
144 intr_register_pic(struct pic *pic)
145 {
146 	int error;
147 
148 	mtx_lock(&intrpic_lock);
149 	if (intr_pic_registered(pic))
150 		error = EBUSY;
151 	else {
152 		TAILQ_INSERT_TAIL(&pics, pic, pics);
153 		error = 0;
154 	}
155 	mtx_unlock(&intrpic_lock);
156 	return (error);
157 }
158 
/*
 * Allocate interrupt source arrays and register interrupt sources
 * once the number of interrupts is known.  Runs single-threaded at
 * SI_SUB_INTR / SI_ORDER_FOURTH + 1, after all PICs have been added
 * (see the SYSINIT ordering comment near the top of this file).
 */
static void
intr_init_sources(void *arg)
{
	struct pic *pic;

	MPASS(num_io_irqs > 0);

	interrupt_sources = mallocarray(num_io_irqs, sizeof(*interrupt_sources),
	    M_INTR, M_WAITOK | M_ZERO);
#ifdef SMP
	interrupt_sorted = mallocarray(num_io_irqs, sizeof(*interrupt_sorted),
	    M_INTR, M_WAITOK | M_ZERO);
#endif

	/*
	 * Size the intrcnt/intrnames arrays:
	 * - 1 "???" dummy counter occupying index 0.
	 * - 2 counters for each I/O interrupt (normal + stray).
	 * - 1 counter for each CPU for the lapic timer.
	 * - 1 counter for each CPU for the Hyper-V vmbus driver.
	 * - 8 counters for each CPU for IPI counters (COUNT_IPIS only).
	 */
	nintrcnt = 1 + num_io_irqs * 2 + mp_ncpus * 2;
#ifdef COUNT_IPIS
	if (mp_ncpus > 1)
		nintrcnt += 8 * mp_ncpus;
#endif
	intrcnt = mallocarray(nintrcnt, sizeof(u_long), M_INTR, M_WAITOK |
	    M_ZERO);
	intrnames = mallocarray(nintrcnt, INTRNAME_LEN, M_INTR, M_WAITOK |
	    M_ZERO);
	/* Export the true array sizes now that they are known. */
	sintrcnt = nintrcnt * sizeof(u_long);
	sintrnames = nintrcnt * INTRNAME_LEN;

	intrcnt_setname("???", 0);
	intrcnt_index = 1;

	/*
	 * NB: intrpic_lock is not held here to avoid LORs due to
	 * malloc() in intr_register_source().  However, we are still
	 * single-threaded at this point in startup so the list of
	 * PICs shouldn't change.
	 */
	TAILQ_FOREACH(pic, &pics, pics) {
		if (pic->pic_register_sources != NULL)
			pic->pic_register_sources(pic);
	}
}
SYSINIT(intr_init_sources, SI_SUB_INTR, SI_ORDER_FOURTH + 1, intr_init_sources,
    NULL);
212 
/*
 * Register a new interrupt source with the global interrupt system.
 * The global interrupts need to be disabled when this function is
 * called.  Returns EEXIST if another source already owns the vector.
 */
int
intr_register_source(struct intsrc *isrc)
{
	int error, vector;

	KASSERT(intr_pic_registered(isrc->is_pic), ("unregistered PIC"));
	vector = isrc->is_pic->pic_vector(isrc);
	KASSERT(vector < num_io_irqs, ("IRQ %d too large (%u irqs)", vector,
	    num_io_irqs));
	/* Cheap unlocked pre-check; re-checked under the lock below. */
	if (interrupt_sources[vector] != NULL)
		return (EEXIST);
	error = intr_event_create(&isrc->is_event, isrc, 0, vector,
	    intr_disable_src, (mask_fn)isrc->is_pic->pic_enable_source,
	    (mask_fn)isrc->is_pic->pic_eoi_source, intr_assign_cpu, "irq%d:",
	    vector);
	if (error)
		return (error);
	sx_xlock(&intrsrc_lock);
	/* Recheck: another thread may have claimed the slot meanwhile. */
	if (interrupt_sources[vector] != NULL) {
		sx_xunlock(&intrsrc_lock);
		intr_event_destroy(isrc->is_event);
		return (EEXIST);
	}
	intrcnt_register(isrc);
	interrupt_sources[vector] = isrc;
	isrc->is_handlers = 0;
	sx_xunlock(&intrsrc_lock);
	return (0);
}
247 
248 struct intsrc *
249 intr_lookup_source(int vector)
250 {
251 
252 	if (vector < 0 || vector >= num_io_irqs)
253 		return (NULL);
254 	return (interrupt_sources[vector]);
255 }
256 
257 int
258 intr_add_handler(const char *name, int vector, driver_filter_t filter,
259     driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep,
260     int domain)
261 {
262 	struct intsrc *isrc;
263 	int error;
264 
265 	isrc = intr_lookup_source(vector);
266 	if (isrc == NULL)
267 		return (EINVAL);
268 	error = intr_event_add_handler(isrc->is_event, name, filter, handler,
269 	    arg, intr_priority(flags), flags, cookiep);
270 	if (error == 0) {
271 		sx_xlock(&intrsrc_lock);
272 		intrcnt_updatename(isrc);
273 		isrc->is_handlers++;
274 		if (isrc->is_handlers == 1) {
275 			isrc->is_domain = domain;
276 			isrc->is_pic->pic_enable_intr(isrc);
277 			isrc->is_pic->pic_enable_source(isrc);
278 		}
279 		sx_xunlock(&intrsrc_lock);
280 	}
281 	return (error);
282 }
283 
284 int
285 intr_remove_handler(void *cookie)
286 {
287 	struct intsrc *isrc;
288 	int error;
289 
290 	isrc = intr_handler_source(cookie);
291 	error = intr_event_remove_handler(cookie);
292 	if (error == 0) {
293 		sx_xlock(&intrsrc_lock);
294 		isrc->is_handlers--;
295 		if (isrc->is_handlers == 0) {
296 			isrc->is_pic->pic_disable_source(isrc, PIC_NO_EOI);
297 			isrc->is_pic->pic_disable_intr(isrc);
298 		}
299 		intrcnt_updatename(isrc);
300 		sx_xunlock(&intrsrc_lock);
301 	}
302 	return (error);
303 }
304 
305 int
306 intr_config_intr(int vector, enum intr_trigger trig, enum intr_polarity pol)
307 {
308 	struct intsrc *isrc;
309 
310 	isrc = intr_lookup_source(vector);
311 	if (isrc == NULL)
312 		return (EINVAL);
313 	return (isrc->is_pic->pic_config_intr(isrc, trig, pol));
314 }
315 
316 static void
317 intr_disable_src(void *arg)
318 {
319 	struct intsrc *isrc;
320 
321 	isrc = arg;
322 	isrc->is_pic->pic_disable_source(isrc, PIC_EOI);
323 }
324 
/*
 * Run the handlers for an interrupt source.  Called from the
 * low-level interrupt entry path with the trapframe of the
 * interrupted context.
 */
void
intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame)
{
	struct intr_event *ie;
	int vector;

	/*
	 * We count software interrupts when we process them.  The
	 * code here follows previous practice, but there's an
	 * argument for counting hardware interrupts when they're
	 * processed too.
	 */
	(*isrc->is_count)++;
	VM_CNT_INC(v_intr);

	ie = isrc->is_event;

	/*
	 * XXX: We assume that IRQ 0 is only used for the ISA timer
	 * device (clk).
	 */
	vector = isrc->is_pic->pic_vector(isrc);
	if (vector == 0)
		clkintr_pending = 1;

	/*
	 * For stray interrupts, mask and EOI the source, bump the
	 * stray count, and log the condition.  Logging stops after
	 * INTR_STRAY_LOG_MAX occurrences to avoid flooding the log.
	 */
	if (intr_event_handle(ie, frame) != 0) {
		isrc->is_pic->pic_disable_source(isrc, PIC_EOI);
		(*isrc->is_straycount)++;
		if (*isrc->is_straycount < INTR_STRAY_LOG_MAX)
			log(LOG_ERR, "stray irq%d\n", vector);
		else if (*isrc->is_straycount == INTR_STRAY_LOG_MAX)
			log(LOG_CRIT,
			    "too many stray irq %d's: not logging anymore\n",
			    vector);
	}
}
365 
366 void
367 intr_resume(bool suspend_cancelled)
368 {
369 	struct pic *pic;
370 
371 #ifndef DEV_ATPIC
372 	atpic_reset();
373 #endif
374 	mtx_lock(&intrpic_lock);
375 	TAILQ_FOREACH(pic, &pics, pics) {
376 		if (pic->pic_resume != NULL)
377 			pic->pic_resume(pic, suspend_cancelled);
378 	}
379 	mtx_unlock(&intrpic_lock);
380 }
381 
382 void
383 intr_suspend(void)
384 {
385 	struct pic *pic;
386 
387 	mtx_lock(&intrpic_lock);
388 	TAILQ_FOREACH_REVERSE(pic, &pics, pics_head, pics) {
389 		if (pic->pic_suspend != NULL)
390 			pic->pic_suspend(pic);
391 	}
392 	mtx_unlock(&intrpic_lock);
393 }
394 
/*
 * Bind an interrupt source to a given logical CPU.  Passed as the
 * assign-cpu callback to intr_event_create().  Returns 0 on success,
 * an error from the PIC's pic_assign_cpu method on failure, or
 * EOPNOTSUPP on non-SMP kernels.
 */
static int
intr_assign_cpu(void *arg, int cpu)
{
#ifdef SMP
	struct intsrc *isrc;
	int error;

	MPASS(mp_ncpus == 1 || smp_started);

	/* Nothing to do if there is only a single CPU. */
	if (mp_ncpus > 1 && cpu != NOCPU) {
		isrc = arg;
		sx_xlock(&intrsrc_lock);
		/* The PIC wants a local APIC ID, not a logical CPU ID. */
		error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]);
		if (error == 0)
			isrc->is_cpu = cpu;	/* Remember the binding. */
		sx_xunlock(&intrsrc_lock);
	} else
		error = 0;
	return (error);
#else
	return (EOPNOTSUPP);
#endif
}
419 
420 static void
421 intrcnt_setname(const char *name, int index)
422 {
423 
424 	snprintf(intrnames + INTRNAME_LEN * index, INTRNAME_LEN, "%-*s",
425 	    INTRNAME_LEN - 1, name);
426 }
427 
/*
 * Refresh the intrcnt name slot for an interrupt source from its
 * event's current full name.
 */
static void
intrcnt_updatename(struct intsrc *is)
{

	intrcnt_setname(is->is_event->ie_fullname, is->is_index);
}
434 
/*
 * Reserve and name the pair of intrcnt slots for an interrupt source:
 * one at is_index for normal interrupts and one immediately after it
 * for strays.
 */
static void
intrcnt_register(struct intsrc *is)
{
	char straystr[INTRNAME_LEN];

	KASSERT(is->is_event != NULL, ("%s: isrc with no event", __func__));
	mtx_lock_spin(&intrcnt_lock);
	MPASS(intrcnt_index + 2 <= nintrcnt);
	is->is_index = intrcnt_index;
	intrcnt_index += 2;	/* Claim two adjacent slots. */
	snprintf(straystr, sizeof(straystr), "stray irq%d",
	    is->is_pic->pic_vector(is));
	intrcnt_updatename(is);
	is->is_count = &intrcnt[is->is_index];
	intrcnt_setname(straystr, is->is_index + 1);
	is->is_straycount = &intrcnt[is->is_index + 1];
	mtx_unlock_spin(&intrcnt_lock);
}
453 
454 void
455 intrcnt_add(const char *name, u_long **countp)
456 {
457 
458 	mtx_lock_spin(&intrcnt_lock);
459 	MPASS(intrcnt_index < nintrcnt);
460 	*countp = &intrcnt[intrcnt_index];
461 	intrcnt_setname(name, intrcnt_index);
462 	intrcnt_index++;
463 	mtx_unlock_spin(&intrcnt_lock);
464 }
465 
466 static void
467 intr_init(void *dummy __unused)
468 {
469 
470 	TAILQ_INIT(&pics);
471 	mtx_init(&intrpic_lock, "intrpic", NULL, MTX_DEF);
472 	sx_init(&intrsrc_lock, "intrsrc");
473 	mtx_init(&intrcnt_lock, "intrcnt", NULL, MTX_SPIN);
474 }
475 SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL);
476 
477 static void
478 intr_init_final(void *dummy __unused)
479 {
480 
481 	/*
482 	 * Enable interrupts on the BSP after all of the interrupt
483 	 * controllers are initialized.  Device interrupts are still
484 	 * disabled in the interrupt controllers until interrupt
485 	 * handlers are registered.  Interrupts are enabled on each AP
486 	 * after their first context switch.
487 	 */
488 	enable_intr();
489 }
490 SYSINIT(intr_init_final, SI_SUB_INTR, SI_ORDER_ANY, intr_init_final, NULL);
491 
#ifndef DEV_ATPIC
/*
 * Initialize the two 8259A's to a known-good shutdown state.  This
 * issues the ICW1-ICW4 initialization sequence to the master and the
 * slave and then masks every pin.  After ICW1 is written to the base
 * port, the remaining ICWs go to the IMR-offset port, so the write
 * order below follows the 8259A protocol and must not be rearranged.
 */
void
atpic_reset(void)
{

	/* Master: begin init, vector base, slave cascade pin, mode. */
	outb(IO_ICU1, ICW1_RESET | ICW1_IC4);
	outb(IO_ICU1 + ICU_IMR_OFFSET, IDT_IO_INTS);
	outb(IO_ICU1 + ICU_IMR_OFFSET, IRQ_MASK(ICU_SLAVEID));
	outb(IO_ICU1 + ICU_IMR_OFFSET, MASTER_MODE);
	/* Mask all pins, then select IRR for subsequent status reads. */
	outb(IO_ICU1 + ICU_IMR_OFFSET, 0xff);
	outb(IO_ICU1, OCW3_SEL | OCW3_RR);

	/* Slave: same sequence with its own vector base and slave ID. */
	outb(IO_ICU2, ICW1_RESET | ICW1_IC4);
	outb(IO_ICU2 + ICU_IMR_OFFSET, IDT_IO_INTS + 8);
	outb(IO_ICU2 + ICU_IMR_OFFSET, ICU_SLAVEID);
	outb(IO_ICU2 + ICU_IMR_OFFSET, SLAVE_MODE);
	outb(IO_ICU2 + ICU_IMR_OFFSET, 0xff);
	outb(IO_ICU2, OCW3_SEL | OCW3_RR);
}
#endif
513 
514 /* Add a description to an active interrupt handler. */
515 int
516 intr_describe(u_int vector, void *ih, const char *descr)
517 {
518 	struct intsrc *isrc;
519 	int error;
520 
521 	isrc = intr_lookup_source(vector);
522 	if (isrc == NULL)
523 		return (EINVAL);
524 	error = intr_event_describe_handler(isrc->is_event, ih, descr);
525 	if (error)
526 		return (error);
527 	intrcnt_updatename(isrc);
528 	return (0);
529 }
530 
531 void
532 intr_reprogram(void)
533 {
534 	struct intsrc *is;
535 	u_int v;
536 
537 	sx_xlock(&intrsrc_lock);
538 	for (v = 0; v < num_io_irqs; v++) {
539 		is = interrupt_sources[v];
540 		if (is == NULL)
541 			continue;
542 		if (is->is_pic->pic_reprogram_pin != NULL)
543 			is->is_pic->pic_reprogram_pin(is);
544 	}
545 	sx_xunlock(&intrsrc_lock);
546 }
547 
#ifdef DDB
/*
 * Dump data about interrupt handlers.  The "v" modifier selects
 * verbose output.
 */
DB_SHOW_COMMAND(irqs, db_show_irqs)
{
	struct intsrc **srcp;
	u_int irq;
	int verbose;

	verbose = (strcmp(modif, "v") == 0);
	srcp = interrupt_sources;
	for (irq = 0; irq < num_io_irqs && !db_pager_quit; irq++, srcp++) {
		if (*srcp != NULL)
			db_dump_intr_event((*srcp)->is_event, verbose);
	}
}
#endif
568 
569 #ifdef SMP
570 /*
571  * Support for balancing interrupt sources across CPUs.  For now we just
572  * allocate CPUs round-robin.
573  *
574  * XXX If the system has a domain with without any usable CPUs (e.g., where all
575  * APIC IDs are 256 or greater and we do not have an IOMMU) we use
576  * intr_no_domain to fall back to assigning interrupts without regard for
577  * domain.  Once we can rely on the presence of an IOMMU on all x86 platforms
578  * we can revert this.
579  */
580 
581 cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1);
582 static int current_cpu[MAXMEMDOM];
583 static bool intr_no_domain;
584 
/*
 * Reset the per-domain round-robin cursors.  If any memory domain has
 * no CPU eligible for interrupt delivery, switch to domain-blind
 * assignment by setting intr_no_domain (see the XXX note above).
 */
static void
intr_init_cpus(void)
{
	int i;

	for (i = 0; i < vm_ndomains; i++) {
		/* Does this domain contain any usable interrupt target? */
		if (CPU_OVERLAP(&cpuset_domain[i], &intr_cpus) == 0) {
			intr_no_domain = true;
			printf("%s: unable to route interrupts to CPUs in domain %d\n",
			    __func__, i);
		}

		current_cpu[i] = 0;
		/* In domain-blind mode only the domain 0 cursor is used. */
		if (intr_no_domain && i > 0)
			continue;
		/* Advance the cursor if CPU 0 is not a valid target here. */
		if (!CPU_ISSET(current_cpu[i], &intr_cpus) ||
		    !CPU_ISSET(current_cpu[i], &cpuset_domain[i]))
			intr_next_cpu(i);
	}
}
605 
/*
 * Return the CPU that the next interrupt source should use.  For now
 * this just returns the next local APIC according to round-robin.
 */
u_int
intr_next_cpu(int domain)
{
	u_int apic_id;

	MPASS(mp_ncpus == 1 || smp_started);
	/* With a single CPU, everything goes to the BSP's local APIC. */
	if (mp_ncpus == 1)
		return (PCPU_GET(apic_id));

	if (intr_no_domain)
		domain = 0;	/* Domain-blind mode shares one cursor. */
	mtx_lock_spin(&icu_lock);
	/* Hand out the current cursor position... */
	apic_id = cpu_apic_ids[current_cpu[domain]];
	/* ...then advance it to the next eligible CPU, wrapping around. */
	do {
		current_cpu[domain]++;
		if (current_cpu[domain] > mp_maxid)
			current_cpu[domain] = 0;
	} while (!CPU_ISSET(current_cpu[domain], &intr_cpus) ||
	    (!CPU_ISSET(current_cpu[domain], &cpuset_domain[domain]) &&
	    !intr_no_domain));
	mtx_unlock_spin(&icu_lock);
	return (apic_id);
}
633 
634 /*
635  * Add a CPU to our mask of valid CPUs that can be destinations of
636  * interrupts.
637  */
638 void
639 intr_add_cpu(u_int cpu)
640 {
641 
642 	if (cpu >= MAXCPU)
643 		panic("%s: Invalid CPU ID %u", __func__, cpu);
644 	if (bootverbose)
645 		printf("INTR: Adding local APIC %d as a target\n",
646 		    cpu_apic_ids[cpu]);
647 
648 	CPU_SET(cpu, &intr_cpus);
649 }
650 
651 static void
652 intr_smp_startup(void *arg __unused)
653 {
654 
655 	intr_init_cpus();
656 	return;
657 }
658 SYSINIT(intr_smp_startup, SI_SUB_SMP, SI_ORDER_SECOND, intr_smp_startup,
659     NULL);
660 
661 /*
662  * TODO: Export this information in a non-MD fashion, integrate with vmstat -i.
663  */
664 static int
665 sysctl_hw_intrs(SYSCTL_HANDLER_ARGS)
666 {
667 	struct sbuf sbuf;
668 	struct intsrc *isrc;
669 	u_int i;
670 	int error;
671 
672 	error = sysctl_wire_old_buffer(req, 0);
673 	if (error != 0)
674 		return (error);
675 
676 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
677 	sx_slock(&intrsrc_lock);
678 	for (i = 0; i < num_io_irqs; i++) {
679 		isrc = interrupt_sources[i];
680 		if (isrc == NULL)
681 			continue;
682 		sbuf_printf(&sbuf, "%s:%d @cpu%d(domain%d): %ld\n",
683 		    isrc->is_event->ie_fullname,
684 		    isrc->is_index,
685 		    isrc->is_cpu,
686 		    isrc->is_domain,
687 		    *isrc->is_count);
688 	}
689 
690 	sx_sunlock(&intrsrc_lock);
691 	error = sbuf_finish(&sbuf);
692 	sbuf_delete(&sbuf);
693 	return (error);
694 }
695 SYSCTL_PROC(_hw, OID_AUTO, intrs,
696     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
697     0, 0, sysctl_hw_intrs, "A",
698     "interrupt:number @cpu: count");
699 
700 /*
701  * Compare two, possibly NULL, entries in the interrupt source array
702  * by load.
703  */
704 static int
705 intrcmp(const void *one, const void *two)
706 {
707 	const struct intsrc *i1, *i2;
708 
709 	i1 = *(const struct intsrc * const *)one;
710 	i2 = *(const struct intsrc * const *)two;
711 	if (i1 != NULL && i2 != NULL)
712 		return (*i1->is_count - *i2->is_count);
713 	if (i1 != NULL)
714 		return (1);
715 	if (i2 != NULL)
716 		return (-1);
717 	return (0);
718 }
719 
/*
 * Balance IRQs across available CPUs according to load.  Runs
 * periodically from a timeout task; the interval is set by the
 * hw.intrbalance sysctl.  When the interval is zero the task still
 * re-arms itself once a minute so re-enabling the sysctl takes
 * effect.
 */
static void
intr_balance(void *dummy __unused, int pending __unused)
{
	struct intsrc *isrc;
	int interval;
	u_int cpu;
	int i;

	interval = intrbalance;
	if (interval == 0)
		goto out;

	/*
	 * Sort interrupts according to count.
	 */
	sx_xlock(&intrsrc_lock);
	memcpy(interrupt_sorted, interrupt_sources, num_io_irqs *
	    sizeof(interrupt_sorted[0]));
	qsort(interrupt_sorted, num_io_irqs, sizeof(interrupt_sorted[0]),
	    intrcmp);

	/*
	 * Restart the scan from the same location to avoid moving in the
	 * common case.
	 */
	intr_init_cpus();

	/*
	 * Assign round-robin from most loaded to least.  Sources whose
	 * event is explicitly bound to a CPU (ie_cpu != NOCPU) are
	 * skipped.
	 */
	for (i = num_io_irqs - 1; i >= 0; i--) {
		isrc = interrupt_sorted[i];
		if (isrc == NULL  || isrc->is_event->ie_cpu != NOCPU)
			continue;
		cpu = current_cpu[isrc->is_domain];
		intr_next_cpu(isrc->is_domain);
		/* Only reprogram the PIC if the target actually changes. */
		if (isrc->is_cpu != cpu &&
		    isrc->is_pic->pic_assign_cpu(isrc,
		    cpu_apic_ids[cpu]) == 0)
			isrc->is_cpu = cpu;
	}
	sx_xunlock(&intrsrc_lock);
out:
	taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task,
	    interval ? hz * interval : hz * 60);

}
770 
771 static void
772 intr_balance_init(void *dummy __unused)
773 {
774 
775 	TIMEOUT_TASK_INIT(taskqueue_thread, &intrbalance_task, 0, intr_balance,
776 	    NULL);
777 	taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task, hz);
778 }
779 SYSINIT(intr_balance_init, SI_SUB_SMP, SI_ORDER_ANY, intr_balance_init, NULL);
780 
#else
/*
 * Always route interrupts to the current processor in the UP case.
 */
u_int
intr_next_cpu(int domain)
{

	/* Only one CPU exists, so its local APIC is the only target. */
	return (PCPU_GET(apic_id));
}
#endif
792