xref: /qemu/hw/intc/ioapic.c (revision 83ecdb18)
1 /*
2  *  ioapic.c IOAPIC emulation logic
3  *
4  *  Copyright (c) 2004-2005 Fabrice Bellard
5  *
6  *  Split the ioapic logic from apic.c
7  *  Xiantao Zhang <xiantao.zhang@intel.com>
8  *
9  * This library is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * This library is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
21  */
22 
23 #include "qemu/osdep.h"
24 #include "qapi/error.h"
25 #include "monitor/monitor.h"
26 #include "hw/i386/apic.h"
27 #include "hw/i386/x86.h"
28 #include "hw/intc/i8259.h"
29 #include "hw/intc/ioapic.h"
30 #include "hw/intc/ioapic_internal.h"
31 #include "hw/pci/msi.h"
32 #include "hw/qdev-properties.h"
33 #include "sysemu/kvm.h"
34 #include "sysemu/sysemu.h"
35 #include "hw/i386/apic-msidef.h"
36 #include "hw/i386/x86-iommu.h"
37 #include "trace.h"
38 
/*
 * Bit positions of the delivery-mode, polarity and trigger-mode fields.
 * NOTE(review): none of these three macros is referenced in the part of
 * the file visible here -- confirm whether they are still needed.
 */
#define APIC_DELIVERY_MODE_SHIFT 8
#define APIC_POLARITY_SHIFT 14
#define APIC_TRIG_MODE_SHIFT 15

/*
 * Table of realized IOAPIC instances.  A slot is filled by
 * ioapic_realize() (indexed by ioapic_no) and the whole table is scanned
 * by ioapic_eoi_broadcast(); empty slots are NULL.
 */
static IOAPICCommonState *ioapics[MAX_IOAPICS];

/* global variable from ioapic_common.c */
extern int ioapic_no;
47 
/*
 * Decoded view of one 64-bit redirection table entry, together with the
 * MSI address/data pair derived from it.  Filled by ioapic_entry_parse().
 */
struct ioapic_entry_info {
    /* fields parsed from IOAPIC entries */
    uint8_t masked;         /* mask bit of the entry (non-zero: masked) */
    uint8_t trig_mode;      /* trigger mode bit (edge or level) */
    uint16_t dest_idx;      /* 16-bit destination / IR index field */
    uint8_t dest_mode;      /* destination mode bit */
    uint8_t delivery_mode;  /* delivery mode field */
    uint8_t vector;         /* vector; read from the PIC for ExtINT */

    /* MSI message generated from above parsed fields */
    uint32_t addr;
    uint32_t data;
};
61 
62 static void ioapic_entry_parse(uint64_t entry, struct ioapic_entry_info *info)
63 {
64     memset(info, 0, sizeof(*info));
65     info->masked = (entry >> IOAPIC_LVT_MASKED_SHIFT) & 1;
66     info->trig_mode = (entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1;
67     /*
68      * By default, this would be dest_id[8] + reserved[8]. When IR
69      * is enabled, this would be interrupt_index[15] +
70      * interrupt_format[1]. This field never means anything, but
71      * only used to generate corresponding MSI.
72      */
73     info->dest_idx = (entry >> IOAPIC_LVT_DEST_IDX_SHIFT) & 0xffff;
74     info->dest_mode = (entry >> IOAPIC_LVT_DEST_MODE_SHIFT) & 1;
75     info->delivery_mode = (entry >> IOAPIC_LVT_DELIV_MODE_SHIFT) \
76         & IOAPIC_DM_MASK;
77     if (info->delivery_mode == IOAPIC_DM_EXTINT) {
78         info->vector = pic_read_irq(isa_pic);
79     } else {
80         info->vector = entry & IOAPIC_VECTOR_MASK;
81     }
82 
83     info->addr = APIC_DEFAULT_ADDRESS | \
84         (info->dest_idx << MSI_ADDR_DEST_IDX_SHIFT) | \
85         (info->dest_mode << MSI_ADDR_DEST_MODE_SHIFT);
86     info->data = (info->vector << MSI_DATA_VECTOR_SHIFT) | \
87         (info->trig_mode << MSI_DATA_TRIGGER_SHIFT) | \
88         (info->delivery_mode << MSI_DATA_DELIVERY_MODE_SHIFT);
89 }
90 
91 static void ioapic_service(IOAPICCommonState *s)
92 {
93     AddressSpace *ioapic_as = X86_MACHINE(qdev_get_machine())->ioapic_as;
94     struct ioapic_entry_info info;
95     uint8_t i;
96     uint32_t mask;
97     uint64_t entry;
98 
99     for (i = 0; i < IOAPIC_NUM_PINS; i++) {
100         mask = 1 << i;
101         if (s->irr & mask) {
102             int coalesce = 0;
103 
104             entry = s->ioredtbl[i];
105             ioapic_entry_parse(entry, &info);
106             if (!info.masked) {
107                 if (info.trig_mode == IOAPIC_TRIGGER_EDGE) {
108                     s->irr &= ~mask;
109                 } else {
110                     coalesce = s->ioredtbl[i] & IOAPIC_LVT_REMOTE_IRR;
111                     trace_ioapic_set_remote_irr(i);
112                     s->ioredtbl[i] |= IOAPIC_LVT_REMOTE_IRR;
113                 }
114 
115                 if (coalesce) {
116                     /* We are level triggered interrupts, and the
117                      * guest should be still working on previous one,
118                      * so skip it. */
119                     continue;
120                 }
121 
122 #ifdef CONFIG_KVM
123                 if (kvm_irqchip_is_split()) {
124                     if (info.trig_mode == IOAPIC_TRIGGER_EDGE) {
125                         kvm_set_irq(kvm_state, i, 1);
126                         kvm_set_irq(kvm_state, i, 0);
127                     } else {
128                         kvm_set_irq(kvm_state, i, 1);
129                     }
130                     continue;
131                 }
132 #endif
133 
134                 /* No matter whether IR is enabled, we translate
135                  * the IOAPIC message into a MSI one, and its
136                  * address space will decide whether we need a
137                  * translation. */
138                 stl_le_phys(ioapic_as, info.addr, info.data);
139             }
140         }
141     }
142 }
143 
/*
 * Number of consecutive EOIs that immediately re-trigger the same pin
 * before ioapic_eoi_broadcast() treats it as an interrupt storm and
 * defers re-delivery to a timer.
 */
#define SUCCESSIVE_IRQ_MAX_COUNT 10000
145 
146 static void delayed_ioapic_service_cb(void *opaque)
147 {
148     IOAPICCommonState *s = opaque;
149 
150     ioapic_service(s);
151 }
152 
153 static void ioapic_set_irq(void *opaque, int vector, int level)
154 {
155     IOAPICCommonState *s = opaque;
156 
157     /* ISA IRQs map to GSI 1-1 except for IRQ0 which maps
158      * to GSI 2.  GSI maps to ioapic 1-1.  This is not
159      * the cleanest way of doing it but it should work. */
160 
161     trace_ioapic_set_irq(vector, level);
162     ioapic_stat_update_irq(s, vector, level);
163     if (vector == 0) {
164         vector = 2;
165     }
166     if (vector < IOAPIC_NUM_PINS) {
167         uint32_t mask = 1 << vector;
168         uint64_t entry = s->ioredtbl[vector];
169 
170         if (((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) ==
171             IOAPIC_TRIGGER_LEVEL) {
172             /* level triggered */
173             if (level) {
174                 s->irr |= mask;
175                 if (!(entry & IOAPIC_LVT_REMOTE_IRR)) {
176                     ioapic_service(s);
177                 }
178             } else {
179                 s->irr &= ~mask;
180             }
181         } else {
182             /* According to the 82093AA manual, we must ignore edge requests
183              * if the input pin is masked. */
184             if (level && !(entry & IOAPIC_LVT_MASKED)) {
185                 s->irr |= mask;
186                 ioapic_service(s);
187             }
188         }
189     }
190 }
191 
192 static void ioapic_update_kvm_routes(IOAPICCommonState *s)
193 {
194 #ifdef CONFIG_KVM
195     int i;
196 
197     if (kvm_irqchip_is_split()) {
198         for (i = 0; i < IOAPIC_NUM_PINS; i++) {
199             MSIMessage msg;
200             struct ioapic_entry_info info;
201             ioapic_entry_parse(s->ioredtbl[i], &info);
202             if (!info.masked) {
203                 msg.address = info.addr;
204                 msg.data = info.data;
205                 kvm_irqchip_update_msi_route(kvm_state, i, msg, NULL);
206             }
207         }
208         kvm_irqchip_commit_routes(kvm_state);
209     }
210 #endif
211 }
212 
#ifdef CONFIG_KVM
/*
 * IEC notifier callback registered with the IOMMU from
 * ioapic_machine_done_notify().  The @global, @index and @mask arguments
 * are ignored: for simplicity all routes are refreshed.
 */
static void ioapic_iec_notifier(void *private, bool global,
                                uint32_t index, uint32_t mask)
{
    IOAPICCommonState *ioapic = private;

    ioapic_update_kvm_routes(ioapic);
}
#endif
222 
/*
 * Handle an EOI for @vector on every registered IOAPIC.
 *
 * For each level-triggered redirection entry whose vector field equals
 * @vector, Remote IRR is cleared.  If the entry is unmasked and its pin
 * is still pending in IRR, the interrupt is delivered again, either
 * immediately or (after SUCCESSIVE_IRQ_MAX_COUNT consecutive
 * re-deliveries) from a timer.
 */
void ioapic_eoi_broadcast(int vector)
{
    IOAPICCommonState *s;
    uint64_t entry;
    int i, n;

    trace_ioapic_eoi_broadcast(vector);

    for (i = 0; i < MAX_IOAPICS; i++) {
        s = ioapics[i];
        if (!s) {
            continue;
        }
        for (n = 0; n < IOAPIC_NUM_PINS; n++) {
            entry = s->ioredtbl[n];

            /* Only level-triggered entries with a matching vector. */
            if ((entry & IOAPIC_VECTOR_MASK) != vector ||
                ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != IOAPIC_TRIGGER_LEVEL) {
                continue;
            }

#ifdef CONFIG_KVM
            /*
             * When IOAPIC is in the userspace while APIC is still in
             * the kernel (i.e., split irqchip), we have a trick to
             * kick the resamplefd logic for registered irqfds from
             * userspace to deactivate the IRQ.  When that happens, it
             * means the irq bypassed userspace IOAPIC (so the irr and
             * remote-irr of the table entry should be bypassed too
             * even if interrupt come).  Still kick the resamplefds if
             * they're bound to the IRQ, to make sure to EOI the
             * interrupt for the hardware correctly.
             *
             * Note: We still need to go through the irr & remote-irr
             * operations below because we don't know whether there're
             * emulated devices that are using/sharing the same IRQ.
             */
            kvm_resample_fd_notify(n);
#endif

            /* Nothing in flight for this entry. */
            if (!(entry & IOAPIC_LVT_REMOTE_IRR)) {
                continue;
            }

            trace_ioapic_clear_remote_irr(n, vector);
            s->ioredtbl[n] = entry & ~IOAPIC_LVT_REMOTE_IRR;

            /* Line still asserted and unmasked: the interrupt fires again. */
            if (!(entry & IOAPIC_LVT_MASKED) && (s->irr & (1 << n))) {
                ++s->irq_eoi[n];
                if (s->irq_eoi[n] >= SUCCESSIVE_IRQ_MAX_COUNT) {
                    /*
                     * Real hardware does not deliver the interrupt immediately
                     * during eoi broadcast, and this lets a buggy guest make
                     * slow progress even if it does not correctly handle a
                     * level-triggered interrupt. Emulate this behavior if we
                     * detect an interrupt storm.
                     */
                    s->irq_eoi[n] = 0;
                    timer_mod_anticipate(s->delayed_ioapic_service_timer,
                                         qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
                                         NANOSECONDS_PER_SECOND / 100);
                    trace_ioapic_eoi_delayed_reassert(n);
                } else {
                    ioapic_service(s);
                }
            } else {
                /* Storm counter only counts consecutive re-deliveries. */
                s->irq_eoi[n] = 0;
            }
        }
    }
}
294 
295 static uint64_t
296 ioapic_mem_read(void *opaque, hwaddr addr, unsigned int size)
297 {
298     IOAPICCommonState *s = opaque;
299     int index;
300     uint32_t val = 0;
301 
302     addr &= 0xff;
303 
304     switch (addr) {
305     case IOAPIC_IOREGSEL:
306         val = s->ioregsel;
307         break;
308     case IOAPIC_IOWIN:
309         if (size != 4) {
310             break;
311         }
312         switch (s->ioregsel) {
313         case IOAPIC_REG_ID:
314         case IOAPIC_REG_ARB:
315             val = s->id << IOAPIC_ID_SHIFT;
316             break;
317         case IOAPIC_REG_VER:
318             val = s->version |
319                 ((IOAPIC_NUM_PINS - 1) << IOAPIC_VER_ENTRIES_SHIFT);
320             break;
321         default:
322             index = (s->ioregsel - IOAPIC_REG_REDTBL_BASE) >> 1;
323             if (index >= 0 && index < IOAPIC_NUM_PINS) {
324                 if (s->ioregsel & 1) {
325                     val = s->ioredtbl[index] >> 32;
326                 } else {
327                     val = s->ioredtbl[index] & 0xffffffff;
328                 }
329             }
330         }
331         break;
332     }
333 
334     trace_ioapic_mem_read(addr, s->ioregsel, size, val);
335 
336     return val;
337 }
338 
339 /*
340  * This is to satisfy the hack in Linux kernel. One hack of it is to
341  * simulate clearing the Remote IRR bit of IOAPIC entry using the
342  * following:
343  *
344  * "For IO-APIC's with EOI register, we use that to do an explicit EOI.
345  * Otherwise, we simulate the EOI message manually by changing the trigger
346  * mode to edge and then back to level, with RTE being masked during
347  * this."
348  *
349  * (See linux kernel __eoi_ioapic_pin() comment in commit c0205701)
350  *
351  * This is based on the assumption that, Remote IRR bit will be
352  * cleared by IOAPIC hardware when configured as edge-triggered
353  * interrupts.
354  *
355  * Without this, level-triggered interrupts in IR mode might fail to
356  * work correctly.
357  */
358 static inline void
359 ioapic_fix_edge_remote_irr(uint64_t *entry)
360 {
361     if (!(*entry & IOAPIC_LVT_TRIGGER_MODE)) {
362         /* Edge-triggered interrupts, make sure remote IRR is zero */
363         *entry &= ~((uint64_t)IOAPIC_LVT_REMOTE_IRR);
364     }
365 }
366 
367 static void
368 ioapic_mem_write(void *opaque, hwaddr addr, uint64_t val,
369                  unsigned int size)
370 {
371     IOAPICCommonState *s = opaque;
372     int index;
373 
374     addr &= 0xff;
375     trace_ioapic_mem_write(addr, s->ioregsel, size, val);
376 
377     switch (addr) {
378     case IOAPIC_IOREGSEL:
379         s->ioregsel = val;
380         break;
381     case IOAPIC_IOWIN:
382         if (size != 4) {
383             break;
384         }
385         switch (s->ioregsel) {
386         case IOAPIC_REG_ID:
387             s->id = (val >> IOAPIC_ID_SHIFT) & IOAPIC_ID_MASK;
388             break;
389         case IOAPIC_REG_VER:
390         case IOAPIC_REG_ARB:
391             break;
392         default:
393             index = (s->ioregsel - IOAPIC_REG_REDTBL_BASE) >> 1;
394             if (index >= 0 && index < IOAPIC_NUM_PINS) {
395                 uint64_t ro_bits = s->ioredtbl[index] & IOAPIC_RO_BITS;
396                 if (s->ioregsel & 1) {
397                     s->ioredtbl[index] &= 0xffffffff;
398                     s->ioredtbl[index] |= (uint64_t)val << 32;
399                 } else {
400                     s->ioredtbl[index] &= ~0xffffffffULL;
401                     s->ioredtbl[index] |= val;
402                 }
403                 /* restore RO bits */
404                 s->ioredtbl[index] &= IOAPIC_RW_BITS;
405                 s->ioredtbl[index] |= ro_bits;
406                 s->irq_eoi[index] = 0;
407                 ioapic_fix_edge_remote_irr(&s->ioredtbl[index]);
408                 ioapic_update_kvm_routes(s);
409                 ioapic_service(s);
410             }
411         }
412         break;
413     case IOAPIC_EOI:
414         /* Explicit EOI is only supported for IOAPIC version 0x20 */
415         if (size != 4 || s->version != 0x20) {
416             break;
417         }
418         ioapic_eoi_broadcast(val);
419         break;
420     }
421 }
422 
/* MMIO callbacks for the region created in ioapic_realize(). */
static const MemoryRegionOps ioapic_io_ops = {
    .read = ioapic_mem_read,
    .write = ioapic_mem_write,
    .endianness = DEVICE_NATIVE_ENDIAN,
};
428 
429 static void ioapic_machine_done_notify(Notifier *notifier, void *data)
430 {
431 #ifdef CONFIG_KVM
432     IOAPICCommonState *s = container_of(notifier, IOAPICCommonState,
433                                         machine_done);
434 
435     if (kvm_irqchip_is_split()) {
436         X86IOMMUState *iommu = x86_iommu_get_default();
437         if (iommu) {
438             /* Register this IOAPIC with IOMMU IEC notifier, so that
439              * when there are IR invalidates, we can be notified to
440              * update kernel IR cache. */
441             x86_iommu_iec_register_notifier(iommu, ioapic_iec_notifier, s);
442         }
443     }
444 #endif
445 }
446 
/* Default value of the "version" property. */
#define IOAPIC_VER_DEF 0x20
448 
449 static void ioapic_realize(DeviceState *dev, Error **errp)
450 {
451     IOAPICCommonState *s = IOAPIC_COMMON(dev);
452 
453     if (s->version != 0x11 && s->version != 0x20) {
454         error_setg(errp, "IOAPIC only supports version 0x11 or 0x20 "
455                    "(default: 0x%x).", IOAPIC_VER_DEF);
456         return;
457     }
458 
459     memory_region_init_io(&s->io_memory, OBJECT(s), &ioapic_io_ops, s,
460                           "ioapic", 0x1000);
461 
462     s->delayed_ioapic_service_timer =
463         timer_new_ns(QEMU_CLOCK_VIRTUAL, delayed_ioapic_service_cb, s);
464 
465     qdev_init_gpio_in(dev, ioapic_set_irq, IOAPIC_NUM_PINS);
466 
467     ioapics[ioapic_no] = s;
468     s->machine_done.notify = ioapic_machine_done_notify;
469     qemu_add_machine_init_done_notifier(&s->machine_done);
470 }
471 
472 static void ioapic_unrealize(DeviceState *dev)
473 {
474     IOAPICCommonState *s = IOAPIC_COMMON(dev);
475 
476     timer_free(s->delayed_ioapic_service_timer);
477 }
478 
/*
 * qdev properties: "version" sets the version field of the state and
 * defaults to IOAPIC_VER_DEF; ioapic_realize() rejects anything other
 * than 0x11 or 0x20.
 */
static Property ioapic_properties[] = {
    DEFINE_PROP_UINT8("version", IOAPICCommonState, version, IOAPIC_VER_DEF),
    DEFINE_PROP_END_OF_LIST(),
};
483 
484 static void ioapic_class_init(ObjectClass *klass, void *data)
485 {
486     IOAPICCommonClass *k = IOAPIC_COMMON_CLASS(klass);
487     DeviceClass *dc = DEVICE_CLASS(klass);
488 
489     k->realize = ioapic_realize;
490     k->unrealize = ioapic_unrealize;
491     /*
492      * If APIC is in kernel, we need to update the kernel cache after
493      * migration, otherwise first 24 gsi routes will be invalid.
494      */
495     k->post_load = ioapic_update_kvm_routes;
496     dc->reset = ioapic_reset_common;
497     device_class_set_props(dc, ioapic_properties);
498 }
499 
/* QOM type description: TYPE_IOAPIC, derived from TYPE_IOAPIC_COMMON. */
static const TypeInfo ioapic_info = {
    .name          = TYPE_IOAPIC,
    .parent        = TYPE_IOAPIC_COMMON,
    .instance_size = sizeof(IOAPICCommonState),
    .class_init    = ioapic_class_init,
};
506 
/* Register the IOAPIC type described by ioapic_info. */
static void ioapic_register_types(void)
{
    type_register_static(&ioapic_info);
}

/* Hand the registration function to the type_init() hook. */
type_init(ioapic_register_types)
513