xref: /freebsd/sys/x86/x86/intr_machdep.c (revision 190cef3d)
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * Machine-dependent interrupt code for x86.  Since x86 systems can have
 * several different PICs, we use the passed-in vector to look up an
 * interrupt source associated with that vector.  The interrupt source
 * describes which PIC the source belongs to and includes methods to handle
 * that source.
 */

#include "opt_atpic.h"
#include "opt_ddb.h"
#include "opt_smp.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/ktr.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/vmmeter.h>
#include <machine/clock.h>
#include <machine/intr_machdep.h>
#include <machine/smp.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif

#ifndef DEV_ATPIC
#include <machine/segments.h>
#include <machine/frame.h>
#include <dev/ic/i8259.h>
#include <x86/isa/icu.h>
#include <isa/isareg.h>
#endif

#include <vm/vm.h>

#define	MAX_STRAY_LOG	5

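/*
 * Used to cast PIC methods that take a struct intsrc * to the void *
 * handler type expected by intr_event_create().
 */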
typedef void (*mask_fn)(void *);

static int intrcnt_index;
static struct intsrc **interrupt_sources;
#ifdef SMP
static struct intsrc **interrupt_sorted;
static int intrbalance;
SYSCTL_INT(_hw, OID_AUTO, intrbalance, CTLFLAG_RW, &intrbalance, 0,
    "Interrupt auto-balance interval (seconds).  Zero disables.");
static struct timeout_task intrbalance_task;
#endif
static struct sx intrsrc_lock;
static struct mtx intrpic_lock;
static struct mtx intrcnt_lock;
static TAILQ_HEAD(pics_head, pic) pics;
u_int num_io_irqs;

#if defined(SMP) && !defined(EARLY_AP_STARTUP)
static int assign_cpu;
#endif

u_long *intrcnt;
char *intrnames;
size_t sintrcnt = sizeof(intrcnt);
size_t sintrnames = sizeof(intrnames);
int nintrcnt;

static MALLOC_DEFINE(M_INTR, "intr", "Interrupt Sources");

static int	intr_assign_cpu(void *arg, int cpu);
static void	intr_disable_src(void *arg);
static void	intr_init(void *__dummy);
static int	intr_pic_registered(struct pic *pic);
static void	intrcnt_setname(const char *name, int index);
static void	intrcnt_updatename(struct intsrc *is);
static void	intrcnt_register(struct intsrc *is);

/*
 * SYSINIT levels for SI_SUB_INTR:
 *
 * SI_ORDER_FIRST: Initialize locks and pics TAILQ, xen_hvm_cpu_init
 * SI_ORDER_SECOND: Xen PICs
 * SI_ORDER_THIRD: Add I/O APIC PICs, alloc MSI and Xen IRQ ranges
 * SI_ORDER_FOURTH: Add 8259A PICs
 * SI_ORDER_FOURTH + 1: Finalize interrupt count and add interrupt sources
 * SI_ORDER_MIDDLE: SMP interrupt counters
 * SI_ORDER_ANY: Enable interrupts on BSP
 */

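/* Return true if the PIC is already on the list of registered PICs. */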
static int
intr_pic_registered(struct pic *pic)
{
	struct pic *p;

	TAILQ_FOREACH(p, &pics, pics) {
		if (p == pic)
			return (1);
	}
	return (0);
}

/*
 * Register a new interrupt controller (PIC).  This is to support suspend
 * and resume where we suspend/resume controllers rather than individual
 * sources.  This also allows controllers with no active sources (such as
 * 8259As in a system using the APICs) to participate in suspend and resume.
 */
int
intr_register_pic(struct pic *pic)
{
	int error;

	mtx_lock(&intrpic_lock);
	if (intr_pic_registered(pic))
		error = EBUSY;
	else {
		TAILQ_INSERT_TAIL(&pics, pic, pics);
		error = 0;
	}
	mtx_unlock(&intrpic_lock);
	return (error);
}

/*
 * Allocate interrupt source arrays and register interrupt sources
 * once the number of interrupts is known.
 */
static void
intr_init_sources(void *arg)
{
	struct pic *pic;

	MPASS(num_io_irqs > 0);

	interrupt_sources = mallocarray(num_io_irqs, sizeof(*interrupt_sources),
	    M_INTR, M_WAITOK | M_ZERO);
#ifdef SMP
	interrupt_sorted = mallocarray(num_io_irqs, sizeof(*interrupt_sorted),
	    M_INTR, M_WAITOK | M_ZERO);
#endif

	/*
	 * - 1 counter for the "???" dummy entry.
	 * - 2 counters for each I/O interrupt.
	 * - 1 counter for each CPU for lapic timer.
	 * - 1 counter for each CPU for the Hyper-V vmbus driver.
	 * - 8 counters for each CPU for IPI counters for SMP.
	 */
	nintrcnt = 1 + num_io_irqs * 2 + mp_ncpus * 2;
#ifdef COUNT_IPIS
	if (mp_ncpus > 1)
		nintrcnt += 8 * mp_ncpus;
#endif
	intrcnt = mallocarray(nintrcnt, sizeof(u_long), M_INTR, M_WAITOK |
	    M_ZERO);
	intrnames = mallocarray(nintrcnt, MAXCOMLEN + 1, M_INTR, M_WAITOK |
	    M_ZERO);
	sintrcnt = nintrcnt * sizeof(u_long);
	sintrnames = nintrcnt * (MAXCOMLEN + 1);

	intrcnt_setname("???", 0);
	intrcnt_index = 1;

	/*
	 * NB: intrpic_lock is not held here to avoid LORs due to
	 * malloc() in intr_register_source().  However, we are still
	 * single-threaded at this point in startup so the list of
	 * PICs shouldn't change.
	 */
	TAILQ_FOREACH(pic, &pics, pics) {
		if (pic->pic_register_sources != NULL)
			pic->pic_register_sources(pic);
	}
}
SYSINIT(intr_init_sources, SI_SUB_INTR, SI_ORDER_FOURTH + 1, intr_init_sources,
    NULL);

/*
 * Register a new interrupt source with the global interrupt system.
 * Interrupts must be disabled when this function is called.
 */
int
intr_register_source(struct intsrc *isrc)
{
	int error, vector;

	KASSERT(intr_pic_registered(isrc->is_pic), ("unregistered PIC"));
	vector = isrc->is_pic->pic_vector(isrc);
	KASSERT(vector < num_io_irqs, ("IRQ %d too large (%u irqs)", vector,
	    num_io_irqs));
	if (interrupt_sources[vector] != NULL)
		return (EEXIST);
	error = intr_event_create(&isrc->is_event, isrc, 0, vector,
	    intr_disable_src, (mask_fn)isrc->is_pic->pic_enable_source,
	    (mask_fn)isrc->is_pic->pic_eoi_source, intr_assign_cpu, "irq%d:",
	    vector);
	if (error)
		return (error);
	sx_xlock(&intrsrc_lock);
	if (interrupt_sources[vector] != NULL) {
		sx_xunlock(&intrsrc_lock);
		intr_event_destroy(isrc->is_event);
		return (EEXIST);
	}
	intrcnt_register(isrc);
	interrupt_sources[vector] = isrc;
	isrc->is_handlers = 0;
	sx_xunlock(&intrsrc_lock);
	return (0);
}

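/*
 * Return the interrupt source for a given vector, or NULL if the vector
 * is out of range or has no source registered.
 */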
struct intsrc *
intr_lookup_source(int vector)
{

	if (vector < 0 || vector >= num_io_irqs)
		return (NULL);
	return (interrupt_sources[vector]);
}

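/*
 * Add a handler to the interrupt event for a vector.  The first handler
 * added to a source also unmasks and enables the underlying interrupt.
 */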
int
intr_add_handler(const char *name, int vector, driver_filter_t filter,
    driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep,
    int domain)
{
	struct intsrc *isrc;
	int error;

	isrc = intr_lookup_source(vector);
	if (isrc == NULL)
		return (EINVAL);
	error = intr_event_add_handler(isrc->is_event, name, filter, handler,
	    arg, intr_priority(flags), flags, cookiep);
	if (error == 0) {
		sx_xlock(&intrsrc_lock);
		intrcnt_updatename(isrc);
		isrc->is_handlers++;
		if (isrc->is_handlers == 1) {
			isrc->is_domain = domain;
			isrc->is_pic->pic_enable_intr(isrc);
			isrc->is_pic->pic_enable_source(isrc);
		}
		sx_xunlock(&intrsrc_lock);
	}
	return (error);
}

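/*
 * Remove a previously added handler.  When the last handler for a source
 * is removed, mask the source and disable the interrupt in its PIC.
 */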
int
intr_remove_handler(void *cookie)
{
	struct intsrc *isrc;
	int error;

	isrc = intr_handler_source(cookie);
	error = intr_event_remove_handler(cookie);
	if (error == 0) {
		sx_xlock(&intrsrc_lock);
		isrc->is_handlers--;
		if (isrc->is_handlers == 0) {
			isrc->is_pic->pic_disable_source(isrc, PIC_NO_EOI);
			isrc->is_pic->pic_disable_intr(isrc);
		}
		intrcnt_updatename(isrc);
		sx_xunlock(&intrsrc_lock);
	}
	return (error);
}

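/* Set the trigger mode and polarity of an interrupt source via its PIC. */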
int
intr_config_intr(int vector, enum intr_trigger trig, enum intr_polarity pol)
{
	struct intsrc *isrc;

	isrc = intr_lookup_source(vector);
	if (isrc == NULL)
		return (EINVAL);
	return (isrc->is_pic->pic_config_intr(isrc, trig, pol));
}

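/*
 * Pre-ithread hook passed to intr_event_create(): mask the source and EOI
 * it so the interrupt stays masked while its ithread runs.
 */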
static void
intr_disable_src(void *arg)
{
	struct intsrc *isrc;

	isrc = arg;
	isrc->is_pic->pic_disable_source(isrc, PIC_EOI);
}

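/*
 * Run the handlers for an interrupt source.  Called from the low-level
 * interrupt entry code with the trap frame of the interrupted context.
 */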
void
intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame)
{
	struct intr_event *ie;
	int vector;

	/*
	 * We count software interrupts when we process them.  The
	 * code here follows previous practice, but there's an
	 * argument for counting hardware interrupts when they're
	 * processed too.
	 */
	(*isrc->is_count)++;
	VM_CNT_INC(v_intr);

	ie = isrc->is_event;

	/*
	 * XXX: We assume that IRQ 0 is only used for the ISA timer
	 * device (clk).
	 */
	vector = isrc->is_pic->pic_vector(isrc);
	if (vector == 0)
		clkintr_pending = 1;

	/*
	 * For stray interrupts, mask and EOI the source, bump the
	 * stray count, and log the condition.
	 */
	if (intr_event_handle(ie, frame) != 0) {
		isrc->is_pic->pic_disable_source(isrc, PIC_EOI);
		(*isrc->is_straycount)++;
		if (*isrc->is_straycount < MAX_STRAY_LOG)
			log(LOG_ERR, "stray irq%d\n", vector);
		else if (*isrc->is_straycount == MAX_STRAY_LOG)
			log(LOG_CRIT,
			    "too many stray irq %d's: not logging anymore\n",
			    vector);
	}
}

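/* Resume all registered interrupt controllers after a suspend. */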
void
intr_resume(bool suspend_cancelled)
{
	struct pic *pic;

#ifndef DEV_ATPIC
	atpic_reset();
#endif
	mtx_lock(&intrpic_lock);
	TAILQ_FOREACH(pic, &pics, pics) {
		if (pic->pic_resume != NULL)
			pic->pic_resume(pic, suspend_cancelled);
	}
	mtx_unlock(&intrpic_lock);
}

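/* Suspend all registered interrupt controllers in reverse registration order. */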
void
intr_suspend(void)
{
	struct pic *pic;

	mtx_lock(&intrpic_lock);
	TAILQ_FOREACH_REVERSE(pic, &pics, pics_head, pics) {
		if (pic->pic_suspend != NULL)
			pic->pic_suspend(pic);
	}
	mtx_unlock(&intrpic_lock);
}

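/*
 * intr_event callback to bind an interrupt source to a CPU.  The CPU id
 * is translated to a local APIC id for the PIC.
 */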
static int
intr_assign_cpu(void *arg, int cpu)
{
#ifdef SMP
	struct intsrc *isrc;
	int error;

#ifdef EARLY_AP_STARTUP
	MPASS(mp_ncpus == 1 || smp_started);

	/* Nothing to do if there is only a single CPU. */
	if (mp_ncpus > 1 && cpu != NOCPU) {
#else
	/*
	 * Don't do anything during early boot.  We will pick up the
	 * assignment once the APs are started.
	 */
	if (assign_cpu && cpu != NOCPU) {
#endif
		isrc = arg;
		sx_xlock(&intrsrc_lock);
		error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]);
		if (error == 0)
			isrc->is_cpu = cpu;
		sx_xunlock(&intrsrc_lock);
	} else
		error = 0;
	return (error);
#else
	return (EOPNOTSUPP);
#endif
}

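/* Write a name, padded to a fixed width, into a slot of the intrnames array. */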
static void
intrcnt_setname(const char *name, int index)
{

	snprintf(intrnames + (MAXCOMLEN + 1) * index, MAXCOMLEN + 1, "%-*s",
	    MAXCOMLEN, name);
}

static void
intrcnt_updatename(struct intsrc *is)
{

	intrcnt_setname(is->is_event->ie_fullname, is->is_index);
}

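/*
 * Reserve two counters for an interrupt source: one for handled
 * interrupts and the slot immediately after it for strays.
 */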
static void
intrcnt_register(struct intsrc *is)
{
	char straystr[MAXCOMLEN + 1];

	KASSERT(is->is_event != NULL, ("%s: isrc with no event", __func__));
	mtx_lock_spin(&intrcnt_lock);
	MPASS(intrcnt_index + 2 <= nintrcnt);
	is->is_index = intrcnt_index;
	intrcnt_index += 2;
	snprintf(straystr, MAXCOMLEN + 1, "stray irq%d",
	    is->is_pic->pic_vector(is));
	intrcnt_updatename(is);
	is->is_count = &intrcnt[is->is_index];
	intrcnt_setname(straystr, is->is_index + 1);
	is->is_straycount = &intrcnt[is->is_index + 1];
	mtx_unlock_spin(&intrcnt_lock);
}

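/*
 * Reserve a single counter slot for a caller-managed count, e.g. the
 * per-CPU local APIC timer counters.
 */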
void
intrcnt_add(const char *name, u_long **countp)
{

	mtx_lock_spin(&intrcnt_lock);
	MPASS(intrcnt_index < nintrcnt);
	*countp = &intrcnt[intrcnt_index];
	intrcnt_setname(name, intrcnt_index);
	intrcnt_index++;
	mtx_unlock_spin(&intrcnt_lock);
}

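/* Initialize the PIC list and the locks that protect interrupt state. */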
static void
intr_init(void *dummy __unused)
{

	TAILQ_INIT(&pics);
	mtx_init(&intrpic_lock, "intrpic", NULL, MTX_DEF);
	sx_init(&intrsrc_lock, "intrsrc");
	mtx_init(&intrcnt_lock, "intrcnt", NULL, MTX_SPIN);
}
SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL);

static void
intr_init_final(void *dummy __unused)
{

	/*
	 * Enable interrupts on the BSP after all of the interrupt
	 * controllers are initialized.  Device interrupts are still
	 * disabled in the interrupt controllers until interrupt
	 * handlers are registered.  Interrupts are enabled on each AP
	 * after their first context switch.
	 */
	enable_intr();
}
SYSINIT(intr_init_final, SI_SUB_INTR, SI_ORDER_ANY, intr_init_final, NULL);

#ifndef DEV_ATPIC
/* Initialize the two 8259A's to a known-good shutdown state. */
void
atpic_reset(void)
{

	outb(IO_ICU1, ICW1_RESET | ICW1_IC4);
	outb(IO_ICU1 + ICU_IMR_OFFSET, IDT_IO_INTS);
	outb(IO_ICU1 + ICU_IMR_OFFSET, IRQ_MASK(ICU_SLAVEID));
	outb(IO_ICU1 + ICU_IMR_OFFSET, MASTER_MODE);
	outb(IO_ICU1 + ICU_IMR_OFFSET, 0xff);
	outb(IO_ICU1, OCW3_SEL | OCW3_RR);

	outb(IO_ICU2, ICW1_RESET | ICW1_IC4);
	outb(IO_ICU2 + ICU_IMR_OFFSET, IDT_IO_INTS + 8);
	outb(IO_ICU2 + ICU_IMR_OFFSET, ICU_SLAVEID);
	outb(IO_ICU2 + ICU_IMR_OFFSET, SLAVE_MODE);
	outb(IO_ICU2 + ICU_IMR_OFFSET, 0xff);
	outb(IO_ICU2, OCW3_SEL | OCW3_RR);
}
#endif

/* Add a description to an active interrupt handler. */
int
intr_describe(u_int vector, void *ih, const char *descr)
{
	struct intsrc *isrc;
	int error;

	isrc = intr_lookup_source(vector);
	if (isrc == NULL)
		return (EINVAL);
	error = intr_event_describe_handler(isrc->is_event, ih, descr);
	if (error)
		return (error);
	intrcnt_updatename(isrc);
	return (0);
}

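/*
 * Walk all interrupt sources and ask each PIC that supports it to
 * reprogram the routing for its pins.
 */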
void
intr_reprogram(void)
{
	struct intsrc *is;
	u_int v;

	sx_xlock(&intrsrc_lock);
	for (v = 0; v < num_io_irqs; v++) {
		is = interrupt_sources[v];
		if (is == NULL)
			continue;
		if (is->is_pic->pic_reprogram_pin != NULL)
			is->is_pic->pic_reprogram_pin(is);
	}
	sx_xunlock(&intrsrc_lock);
}

#ifdef DDB
/*
 * Dump data about interrupt handlers.
 */
DB_SHOW_COMMAND(irqs, db_show_irqs)
{
	struct intsrc **isrc;
	u_int i;
	int verbose;

	if (strcmp(modif, "v") == 0)
		verbose = 1;
	else
		verbose = 0;
	isrc = interrupt_sources;
	for (i = 0; i < num_io_irqs && !db_pager_quit; i++, isrc++)
		if (*isrc != NULL)
			db_dump_intr_event((*isrc)->is_event, verbose);
}
#endif

#ifdef SMP
/*
 * Support for balancing interrupt sources across CPUs.  For now we just
 * allocate CPUs round-robin.
 */

cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1);
static int current_cpu[MAXMEMDOM];

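/*
 * Reset the per-domain round-robin iterators, skipping over any CPU that
 * is not a valid interrupt target or is outside the domain.
 */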
static void
intr_init_cpus(void)
{
	int i;

	for (i = 0; i < vm_ndomains; i++) {
		current_cpu[i] = 0;
		if (!CPU_ISSET(current_cpu[i], &intr_cpus) ||
		    !CPU_ISSET(current_cpu[i], &cpuset_domain[i]))
			intr_next_cpu(i);
	}
}

/*
 * Return the CPU that the next interrupt source should use.  For now
 * this just returns the next local APIC according to round-robin.
 */
u_int
intr_next_cpu(int domain)
{
	u_int apic_id;

#ifdef EARLY_AP_STARTUP
	MPASS(mp_ncpus == 1 || smp_started);
	if (mp_ncpus == 1)
		return (PCPU_GET(apic_id));
#else
	/* Leave all interrupts on the BSP during boot. */
	if (!assign_cpu)
		return (PCPU_GET(apic_id));
#endif

	mtx_lock_spin(&icu_lock);
	apic_id = cpu_apic_ids[current_cpu[domain]];
	do {
		current_cpu[domain]++;
		if (current_cpu[domain] > mp_maxid)
			current_cpu[domain] = 0;
	} while (!CPU_ISSET(current_cpu[domain], &intr_cpus) ||
	    !CPU_ISSET(current_cpu[domain], &cpuset_domain[domain]));
	mtx_unlock_spin(&icu_lock);
	return (apic_id);
}

/* Attempt to bind the specified IRQ to the specified CPU. */
int
intr_bind(u_int vector, u_char cpu)
{
	struct intsrc *isrc;

	isrc = intr_lookup_source(vector);
	if (isrc == NULL)
		return (EINVAL);
	return (intr_event_bind(isrc->is_event, cpu));
}

/*
 * Add a CPU to our mask of valid CPUs that can be destinations of
 * interrupts.
 */
void
intr_add_cpu(u_int cpu)
{

	if (cpu >= MAXCPU)
		panic("%s: Invalid CPU ID", __func__);
	if (bootverbose)
		printf("INTR: Adding local APIC %d as a target\n",
		    cpu_apic_ids[cpu]);

	CPU_SET(cpu, &intr_cpus);
}

#ifdef EARLY_AP_STARTUP
static void
intr_smp_startup(void *arg __unused)
{

	intr_init_cpus();
}
SYSINIT(intr_smp_startup, SI_SUB_SMP, SI_ORDER_SECOND, intr_smp_startup,
    NULL);

#else
/*
 * Distribute all the interrupt sources among the available CPUs once the
 * APs have been launched.
 */
static void
intr_shuffle_irqs(void *arg __unused)
{
	struct intsrc *isrc;
	u_int cpu, i;

	intr_init_cpus();
	/* Don't bother on UP. */
	if (mp_ncpus == 1)
		return;

	/* Round-robin assign a CPU to each enabled source. */
	sx_xlock(&intrsrc_lock);
	assign_cpu = 1;
	for (i = 0; i < num_io_irqs; i++) {
		isrc = interrupt_sources[i];
		if (isrc != NULL && isrc->is_handlers > 0) {
			/*
			 * If this event is already bound to a CPU,
			 * then assign the source to that CPU instead
			 * of picking one via round-robin.  Note that
			 * this is careful to only advance the
			 * round-robin if the CPU assignment succeeds.
			 */
			cpu = isrc->is_event->ie_cpu;
			if (cpu == NOCPU)
				cpu = current_cpu[isrc->is_domain];
			if (isrc->is_pic->pic_assign_cpu(isrc,
			    cpu_apic_ids[cpu]) == 0) {
				isrc->is_cpu = cpu;
				if (isrc->is_event->ie_cpu == NOCPU)
					intr_next_cpu(isrc->is_domain);
			}
		}
	}
	sx_xunlock(&intrsrc_lock);
}
SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs,
    NULL);
#endif

/*
 * TODO: Export this information in a non-MD fashion, integrate with vmstat -i.
 */
static int
sysctl_hw_intrs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct intsrc *isrc;
	u_int i;
	int error;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);

	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	sx_slock(&intrsrc_lock);
	for (i = 0; i < num_io_irqs; i++) {
		isrc = interrupt_sources[i];
		if (isrc == NULL)
			continue;
		sbuf_printf(&sbuf, "%s:%d @cpu%d(domain%d): %ld\n",
		    isrc->is_event->ie_fullname,
		    isrc->is_index,
		    isrc->is_cpu,
		    isrc->is_domain,
		    *isrc->is_count);
	}
	sx_sunlock(&intrsrc_lock);
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
SYSCTL_PROC(_hw, OID_AUTO, intrs, CTLTYPE_STRING | CTLFLAG_RW,
    0, 0, sysctl_hw_intrs, "A", "interrupt:number @cpu: count");

/*
 * Compare two, possibly NULL, entries in the interrupt source array
 * by load.
 */
static int
intrcmp(const void *one, const void *two)
{
	const struct intsrc *i1, *i2;

	i1 = *(const struct intsrc * const *)one;
	i2 = *(const struct intsrc * const *)two;
	if (i1 != NULL && i2 != NULL)
		return (*i1->is_count - *i2->is_count);
	if (i1 != NULL)
		return (1);
	if (i2 != NULL)
		return (-1);
	return (0);
}

/*
 * Balance IRQs across available CPUs according to load.
 */
static void
intr_balance(void *dummy __unused, int pending __unused)
{
	struct intsrc *isrc;
	int interval;
	u_int cpu;
	int i;

	interval = intrbalance;
	if (interval == 0)
		goto out;

	/*
	 * Sort interrupts according to count.
	 */
	sx_xlock(&intrsrc_lock);
	memcpy(interrupt_sorted, interrupt_sources, num_io_irqs *
	    sizeof(interrupt_sorted[0]));
	qsort(interrupt_sorted, num_io_irqs, sizeof(interrupt_sorted[0]),
	    intrcmp);

	/*
	 * Restart the scan from the same location to avoid moving in the
	 * common case.
	 */
	intr_init_cpus();

	/*
	 * Assign round-robin from most loaded to least.
	 */
	for (i = num_io_irqs - 1; i >= 0; i--) {
		isrc = interrupt_sorted[i];
		if (isrc == NULL || isrc->is_event->ie_cpu != NOCPU)
			continue;
		cpu = current_cpu[isrc->is_domain];
		intr_next_cpu(isrc->is_domain);
		if (isrc->is_cpu != cpu &&
		    isrc->is_pic->pic_assign_cpu(isrc,
		    cpu_apic_ids[cpu]) == 0)
			isrc->is_cpu = cpu;
	}
	sx_xunlock(&intrsrc_lock);
out:
	taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task,
	    interval ? hz * interval : hz * 60);
}

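/* Schedule the periodic interrupt-balancing task. */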
static void
intr_balance_init(void *dummy __unused)
{

	TIMEOUT_TASK_INIT(taskqueue_thread, &intrbalance_task, 0, intr_balance,
	    NULL);
	taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task, hz);
}
SYSINIT(intr_balance_init, SI_SUB_SMP, SI_ORDER_ANY, intr_balance_init, NULL);

#else
/*
 * Always route interrupts to the current processor in the UP case.
 */
u_int
intr_next_cpu(int domain)
{

	return (PCPU_GET(apic_id));
}
#endif
849