xref: /qemu/hw/pci/msix.c (revision 9c4218e9)
1 /*
2  * MSI-X device support
3  *
4  * This module includes support for MSI-X in pci devices.
5  *
6  * Author: Michael S. Tsirkin <mst@redhat.com>
7  *
8  *  Copyright (c) 2009, Red Hat Inc, Michael S. Tsirkin (mst@redhat.com)
9  *
10  * This work is licensed under the terms of the GNU GPL, version 2.  See
11  * the COPYING file in the top-level directory.
12  *
13  * Contributions after 2012-01-13 are licensed under the terms of the
14  * GNU GPL, version 2 or (at your option) any later version.
15  */
16 
17 #include "qemu/osdep.h"
18 #include "hw/hw.h"
19 #include "hw/pci/msi.h"
20 #include "hw/pci/msix.h"
21 #include "hw/pci/pci.h"
22 #include "qemu/range.h"
23 
/* Size passed to pci_add_capability()/pci_del_capability() for MSI-X. */
#define MSIX_CAP_LENGTH 12

/*
 * The MSI-X enable bit and the mask-all bit are in byte 1 of the FLAGS
 * register: the control byte is addressed one byte past PCI_MSIX_FLAGS and
 * the flag masks are shifted down by 8 so they apply to that single byte.
 */
#define MSIX_CONTROL_OFFSET (PCI_MSIX_FLAGS + 1)
#define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8)
#define MSIX_MASKALL_MASK (PCI_MSIX_FLAGS_MASKALL >> 8)
30 
31 MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
32 {
33     uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE;
34     MSIMessage msg;
35 
36     msg.address = pci_get_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR);
37     msg.data = pci_get_long(table_entry + PCI_MSIX_ENTRY_DATA);
38     return msg;
39 }
40 
41 /*
42  * Special API for POWER to configure the vectors through
43  * a side channel. Should never be used by devices.
44  */
45 void msix_set_message(PCIDevice *dev, int vector, struct MSIMessage msg)
46 {
47     uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE;
48 
49     pci_set_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR, msg.address);
50     pci_set_long(table_entry + PCI_MSIX_ENTRY_DATA, msg.data);
51     table_entry[PCI_MSIX_ENTRY_VECTOR_CTRL] &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
52 }
53 
/* Bit mask selecting @vector's bit inside its pending-bit-array byte. */
static uint8_t msix_pending_mask(int vector)
{
    const int bit = vector % 8;

    return 1 << bit;
}
58 
59 static uint8_t *msix_pending_byte(PCIDevice *dev, int vector)
60 {
61     return dev->msix_pba + vector / 8;
62 }
63 
64 static int msix_is_pending(PCIDevice *dev, int vector)
65 {
66     return *msix_pending_byte(dev, vector) & msix_pending_mask(vector);
67 }
68 
69 void msix_set_pending(PCIDevice *dev, unsigned int vector)
70 {
71     *msix_pending_byte(dev, vector) |= msix_pending_mask(vector);
72 }
73 
74 static void msix_clr_pending(PCIDevice *dev, int vector)
75 {
76     *msix_pending_byte(dev, vector) &= ~msix_pending_mask(vector);
77 }
78 
79 static bool msix_vector_masked(PCIDevice *dev, unsigned int vector, bool fmask)
80 {
81     unsigned offset = vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL;
82     return fmask || dev->msix_table[offset] & PCI_MSIX_ENTRY_CTRL_MASKBIT;
83 }
84 
85 bool msix_is_masked(PCIDevice *dev, unsigned int vector)
86 {
87     return msix_vector_masked(dev, vector, dev->msix_function_masked);
88 }
89 
90 static void msix_fire_vector_notifier(PCIDevice *dev,
91                                       unsigned int vector, bool is_masked)
92 {
93     MSIMessage msg;
94     int ret;
95 
96     if (!dev->msix_vector_use_notifier) {
97         return;
98     }
99     if (is_masked) {
100         dev->msix_vector_release_notifier(dev, vector);
101     } else {
102         msg = msix_get_message(dev, vector);
103         ret = dev->msix_vector_use_notifier(dev, vector, msg);
104         assert(ret >= 0);
105     }
106 }
107 
108 static void msix_handle_mask_update(PCIDevice *dev, int vector, bool was_masked)
109 {
110     bool is_masked = msix_is_masked(dev, vector);
111 
112     if (is_masked == was_masked) {
113         return;
114     }
115 
116     msix_fire_vector_notifier(dev, vector, is_masked);
117 
118     if (!is_masked && msix_is_pending(dev, vector)) {
119         msix_clr_pending(dev, vector);
120         msix_notify(dev, vector);
121     }
122 }
123 
124 static void msix_update_function_masked(PCIDevice *dev)
125 {
126     dev->msix_function_masked = !msix_enabled(dev) ||
127         (dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] & MSIX_MASKALL_MASK);
128 }
129 
130 /* Handle MSI-X capability config write. */
131 void msix_write_config(PCIDevice *dev, uint32_t addr,
132                        uint32_t val, int len)
133 {
134     unsigned enable_pos = dev->msix_cap + MSIX_CONTROL_OFFSET;
135     int vector;
136     bool was_masked;
137 
138     if (!msix_present(dev) || !range_covers_byte(addr, len, enable_pos)) {
139         return;
140     }
141 
142     was_masked = dev->msix_function_masked;
143     msix_update_function_masked(dev);
144 
145     if (!msix_enabled(dev)) {
146         return;
147     }
148 
149     pci_device_deassert_intx(dev);
150 
151     if (dev->msix_function_masked == was_masked) {
152         return;
153     }
154 
155     for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
156         msix_handle_mask_update(dev, vector,
157                                 msix_vector_masked(dev, vector, was_masked));
158     }
159 }
160 
161 static uint64_t msix_table_mmio_read(void *opaque, hwaddr addr,
162                                      unsigned size)
163 {
164     PCIDevice *dev = opaque;
165 
166     return pci_get_long(dev->msix_table + addr);
167 }
168 
169 static void msix_table_mmio_write(void *opaque, hwaddr addr,
170                                   uint64_t val, unsigned size)
171 {
172     PCIDevice *dev = opaque;
173     int vector = addr / PCI_MSIX_ENTRY_SIZE;
174     bool was_masked;
175 
176     was_masked = msix_is_masked(dev, vector);
177     pci_set_long(dev->msix_table + addr, val);
178     msix_handle_mask_update(dev, vector, was_masked);
179 }
180 
181 static const MemoryRegionOps msix_table_mmio_ops = {
182     .read = msix_table_mmio_read,
183     .write = msix_table_mmio_write,
184     .endianness = DEVICE_LITTLE_ENDIAN,
185     .valid = {
186         .min_access_size = 4,
187         .max_access_size = 4,
188     },
189 };
190 
191 static uint64_t msix_pba_mmio_read(void *opaque, hwaddr addr,
192                                    unsigned size)
193 {
194     PCIDevice *dev = opaque;
195     if (dev->msix_vector_poll_notifier) {
196         unsigned vector_start = addr * 8;
197         unsigned vector_end = MIN(addr + size * 8, dev->msix_entries_nr);
198         dev->msix_vector_poll_notifier(dev, vector_start, vector_end);
199     }
200 
201     return pci_get_long(dev->msix_pba + addr);
202 }
203 
204 static void msix_pba_mmio_write(void *opaque, hwaddr addr,
205                                 uint64_t val, unsigned size)
206 {
207 }
208 
209 static const MemoryRegionOps msix_pba_mmio_ops = {
210     .read = msix_pba_mmio_read,
211     .write = msix_pba_mmio_write,
212     .endianness = DEVICE_LITTLE_ENDIAN,
213     .valid = {
214         .min_access_size = 4,
215         .max_access_size = 4,
216     },
217 };
218 
219 static void msix_mask_all(struct PCIDevice *dev, unsigned nentries)
220 {
221     int vector;
222 
223     for (vector = 0; vector < nentries; ++vector) {
224         unsigned offset =
225             vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL;
226         bool was_masked = msix_is_masked(dev, vector);
227 
228         dev->msix_table[offset] |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
229         msix_handle_mask_update(dev, vector, was_masked);
230     }
231 }
232 
233 /* Initialize the MSI-X structures */
234 int msix_init(struct PCIDevice *dev, unsigned short nentries,
235               MemoryRegion *table_bar, uint8_t table_bar_nr,
236               unsigned table_offset, MemoryRegion *pba_bar,
237               uint8_t pba_bar_nr, unsigned pba_offset, uint8_t cap_pos)
238 {
239     int cap;
240     unsigned table_size, pba_size;
241     uint8_t *config;
242 
243     /* Nothing to do if MSI is not supported by interrupt controller */
244     if (!msi_supported) {
245         return -ENOTSUP;
246     }
247 
248     if (nentries < 1 || nentries > PCI_MSIX_FLAGS_QSIZE + 1) {
249         return -EINVAL;
250     }
251 
252     table_size = nentries * PCI_MSIX_ENTRY_SIZE;
253     pba_size = QEMU_ALIGN_UP(nentries, 64) / 8;
254 
255     /* Sanity test: table & pba don't overlap, fit within BARs, min aligned */
256     if ((table_bar_nr == pba_bar_nr &&
257          ranges_overlap(table_offset, table_size, pba_offset, pba_size)) ||
258         table_offset + table_size > memory_region_size(table_bar) ||
259         pba_offset + pba_size > memory_region_size(pba_bar) ||
260         (table_offset | pba_offset) & PCI_MSIX_FLAGS_BIRMASK) {
261         return -EINVAL;
262     }
263 
264     cap = pci_add_capability(dev, PCI_CAP_ID_MSIX, cap_pos, MSIX_CAP_LENGTH);
265     if (cap < 0) {
266         return cap;
267     }
268 
269     dev->msix_cap = cap;
270     dev->cap_present |= QEMU_PCI_CAP_MSIX;
271     config = dev->config + cap;
272 
273     pci_set_word(config + PCI_MSIX_FLAGS, nentries - 1);
274     dev->msix_entries_nr = nentries;
275     dev->msix_function_masked = true;
276 
277     pci_set_long(config + PCI_MSIX_TABLE, table_offset | table_bar_nr);
278     pci_set_long(config + PCI_MSIX_PBA, pba_offset | pba_bar_nr);
279 
280     /* Make flags bit writable. */
281     dev->wmask[cap + MSIX_CONTROL_OFFSET] |= MSIX_ENABLE_MASK |
282                                              MSIX_MASKALL_MASK;
283 
284     dev->msix_table = g_malloc0(table_size);
285     dev->msix_pba = g_malloc0(pba_size);
286     dev->msix_entry_used = g_malloc0(nentries * sizeof *dev->msix_entry_used);
287 
288     msix_mask_all(dev, nentries);
289 
290     memory_region_init_io(&dev->msix_table_mmio, OBJECT(dev), &msix_table_mmio_ops, dev,
291                           "msix-table", table_size);
292     memory_region_add_subregion(table_bar, table_offset, &dev->msix_table_mmio);
293     memory_region_init_io(&dev->msix_pba_mmio, OBJECT(dev), &msix_pba_mmio_ops, dev,
294                           "msix-pba", pba_size);
295     memory_region_add_subregion(pba_bar, pba_offset, &dev->msix_pba_mmio);
296 
297     return 0;
298 }
299 
300 int msix_init_exclusive_bar(PCIDevice *dev, unsigned short nentries,
301                             uint8_t bar_nr)
302 {
303     int ret;
304     char *name;
305     uint32_t bar_size = 4096;
306     uint32_t bar_pba_offset = bar_size / 2;
307     uint32_t bar_pba_size = (nentries / 8 + 1) * 8;
308 
309     /*
310      * Migration compatibility dictates that this remains a 4k
311      * BAR with the vector table in the lower half and PBA in
312      * the upper half for nentries which is lower or equal to 128.
313      * No need to care about using more than 65 entries for legacy
314      * machine types who has at most 64 queues.
315      */
316     if (nentries * PCI_MSIX_ENTRY_SIZE > bar_pba_offset) {
317         bar_pba_offset = nentries * PCI_MSIX_ENTRY_SIZE;
318     }
319 
320     if (bar_pba_offset + bar_pba_size > 4096) {
321         bar_size = bar_pba_offset + bar_pba_size;
322     }
323 
324     bar_size = pow2ceil(bar_size);
325 
326     name = g_strdup_printf("%s-msix", dev->name);
327     memory_region_init(&dev->msix_exclusive_bar, OBJECT(dev), name, bar_size);
328     g_free(name);
329 
330     ret = msix_init(dev, nentries, &dev->msix_exclusive_bar, bar_nr,
331                     0, &dev->msix_exclusive_bar,
332                     bar_nr, bar_pba_offset,
333                     0);
334     if (ret) {
335         return ret;
336     }
337 
338     pci_register_bar(dev, bar_nr, PCI_BASE_ADDRESS_SPACE_MEMORY,
339                      &dev->msix_exclusive_bar);
340 
341     return 0;
342 }
343 
344 static void msix_free_irq_entries(PCIDevice *dev)
345 {
346     int vector;
347 
348     for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
349         dev->msix_entry_used[vector] = 0;
350         msix_clr_pending(dev, vector);
351     }
352 }
353 
354 static void msix_clear_all_vectors(PCIDevice *dev)
355 {
356     int vector;
357 
358     for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
359         msix_clr_pending(dev, vector);
360     }
361 }
362 
363 /* Clean up resources for the device. */
364 void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
365 {
366     if (!msix_present(dev)) {
367         return;
368     }
369     pci_del_capability(dev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH);
370     dev->msix_cap = 0;
371     msix_free_irq_entries(dev);
372     dev->msix_entries_nr = 0;
373     memory_region_del_subregion(pba_bar, &dev->msix_pba_mmio);
374     g_free(dev->msix_pba);
375     dev->msix_pba = NULL;
376     memory_region_del_subregion(table_bar, &dev->msix_table_mmio);
377     g_free(dev->msix_table);
378     dev->msix_table = NULL;
379     g_free(dev->msix_entry_used);
380     dev->msix_entry_used = NULL;
381     dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
382 }
383 
384 void msix_uninit_exclusive_bar(PCIDevice *dev)
385 {
386     if (msix_present(dev)) {
387         msix_uninit(dev, &dev->msix_exclusive_bar, &dev->msix_exclusive_bar);
388     }
389 }
390 
391 void msix_save(PCIDevice *dev, QEMUFile *f)
392 {
393     unsigned n = dev->msix_entries_nr;
394 
395     if (!msix_present(dev)) {
396         return;
397     }
398 
399     qemu_put_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
400     qemu_put_buffer(f, dev->msix_pba, (n + 7) / 8);
401 }
402 
403 /* Should be called after restoring the config space. */
404 void msix_load(PCIDevice *dev, QEMUFile *f)
405 {
406     unsigned n = dev->msix_entries_nr;
407     unsigned int vector;
408 
409     if (!msix_present(dev)) {
410         return;
411     }
412 
413     msix_clear_all_vectors(dev);
414     qemu_get_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
415     qemu_get_buffer(f, dev->msix_pba, (n + 7) / 8);
416     msix_update_function_masked(dev);
417 
418     for (vector = 0; vector < n; vector++) {
419         msix_handle_mask_update(dev, vector, true);
420     }
421 }
422 
423 /* Does device support MSI-X? */
424 int msix_present(PCIDevice *dev)
425 {
426     return dev->cap_present & QEMU_PCI_CAP_MSIX;
427 }
428 
429 /* Is MSI-X enabled? */
430 int msix_enabled(PCIDevice *dev)
431 {
432     return (dev->cap_present & QEMU_PCI_CAP_MSIX) &&
433         (dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &
434          MSIX_ENABLE_MASK);
435 }
436 
437 /* Send an MSI-X message */
438 void msix_notify(PCIDevice *dev, unsigned vector)
439 {
440     MSIMessage msg;
441 
442     if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
443         return;
444     if (msix_is_masked(dev, vector)) {
445         msix_set_pending(dev, vector);
446         return;
447     }
448 
449     msg = msix_get_message(dev, vector);
450 
451     msi_send_message(dev, msg);
452 }
453 
454 void msix_reset(PCIDevice *dev)
455 {
456     if (!msix_present(dev)) {
457         return;
458     }
459     msix_clear_all_vectors(dev);
460     dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &=
461 	    ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET];
462     memset(dev->msix_table, 0, dev->msix_entries_nr * PCI_MSIX_ENTRY_SIZE);
463     memset(dev->msix_pba, 0, QEMU_ALIGN_UP(dev->msix_entries_nr, 64) / 8);
464     msix_mask_all(dev, dev->msix_entries_nr);
465 }
466 
/* The PCI spec suggests that devices make it possible for software to
 * configure fewer vectors than the device supports, but does not specify a
 * standard mechanism for doing so.
 *
 * We support this by asking devices to declare the vectors software is
 * actually going to use, and checking this on the notification path. Devices
 * that don't want to follow the spec suggestion can declare all vectors as
 * used. */
474 
475 /* Mark vector as used. */
476 int msix_vector_use(PCIDevice *dev, unsigned vector)
477 {
478     if (vector >= dev->msix_entries_nr)
479         return -EINVAL;
480     dev->msix_entry_used[vector]++;
481     return 0;
482 }
483 
484 /* Mark vector as unused. */
485 void msix_vector_unuse(PCIDevice *dev, unsigned vector)
486 {
487     if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) {
488         return;
489     }
490     if (--dev->msix_entry_used[vector]) {
491         return;
492     }
493     msix_clr_pending(dev, vector);
494 }
495 
496 void msix_unuse_all_vectors(PCIDevice *dev)
497 {
498     if (!msix_present(dev)) {
499         return;
500     }
501     msix_free_irq_entries(dev);
502 }
503 
504 unsigned int msix_nr_vectors_allocated(const PCIDevice *dev)
505 {
506     return dev->msix_entries_nr;
507 }
508 
509 static int msix_set_notifier_for_vector(PCIDevice *dev, unsigned int vector)
510 {
511     MSIMessage msg;
512 
513     if (msix_is_masked(dev, vector)) {
514         return 0;
515     }
516     msg = msix_get_message(dev, vector);
517     return dev->msix_vector_use_notifier(dev, vector, msg);
518 }
519 
520 static void msix_unset_notifier_for_vector(PCIDevice *dev, unsigned int vector)
521 {
522     if (msix_is_masked(dev, vector)) {
523         return;
524     }
525     dev->msix_vector_release_notifier(dev, vector);
526 }
527 
528 int msix_set_vector_notifiers(PCIDevice *dev,
529                               MSIVectorUseNotifier use_notifier,
530                               MSIVectorReleaseNotifier release_notifier,
531                               MSIVectorPollNotifier poll_notifier)
532 {
533     int vector, ret;
534 
535     assert(use_notifier && release_notifier);
536 
537     dev->msix_vector_use_notifier = use_notifier;
538     dev->msix_vector_release_notifier = release_notifier;
539     dev->msix_vector_poll_notifier = poll_notifier;
540 
541     if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &
542         (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) {
543         for (vector = 0; vector < dev->msix_entries_nr; vector++) {
544             ret = msix_set_notifier_for_vector(dev, vector);
545             if (ret < 0) {
546                 goto undo;
547             }
548         }
549     }
550     if (dev->msix_vector_poll_notifier) {
551         dev->msix_vector_poll_notifier(dev, 0, dev->msix_entries_nr);
552     }
553     return 0;
554 
555 undo:
556     while (--vector >= 0) {
557         msix_unset_notifier_for_vector(dev, vector);
558     }
559     dev->msix_vector_use_notifier = NULL;
560     dev->msix_vector_release_notifier = NULL;
561     return ret;
562 }
563 
564 void msix_unset_vector_notifiers(PCIDevice *dev)
565 {
566     int vector;
567 
568     assert(dev->msix_vector_use_notifier &&
569            dev->msix_vector_release_notifier);
570 
571     if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &
572         (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) {
573         for (vector = 0; vector < dev->msix_entries_nr; vector++) {
574             msix_unset_notifier_for_vector(dev, vector);
575         }
576     }
577     dev->msix_vector_use_notifier = NULL;
578     dev->msix_vector_release_notifier = NULL;
579     dev->msix_vector_poll_notifier = NULL;
580 }
581 
582 static void put_msix_state(QEMUFile *f, void *pv, size_t size)
583 {
584     msix_save(pv, f);
585 }
586 
587 static int get_msix_state(QEMUFile *f, void *pv, size_t size)
588 {
589     msix_load(pv, f);
590     return 0;
591 }
592 
593 static VMStateInfo vmstate_info_msix = {
594     .name = "msix state",
595     .get  = get_msix_state,
596     .put  = put_msix_state,
597 };
598 
599 const VMStateDescription vmstate_msix = {
600     .name = "msix",
601     .fields = (VMStateField[]) {
602         {
603             .name         = "msix",
604             .version_id   = 0,
605             .field_exists = NULL,
606             .size         = 0,   /* ouch */
607             .info         = &vmstate_info_msix,
608             .flags        = VMS_SINGLE,
609             .offset       = 0,
610         },
611         VMSTATE_END_OF_LIST()
612     }
613 };
614