1 /*
2  * virpci.c: helper APIs for managing host PCI devices
3  *
4  * Copyright (C) 2009-2015 Red Hat, Inc.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library.  If not, see
18  * <http://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "virpci.h"
24 #include "virnetdev.h"
25 
26 #include <dirent.h>
27 #include <fcntl.h>
28 #include <inttypes.h>
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 #include <unistd.h>
32 
33 #include "virlog.h"
34 #include "vircommand.h"
35 #include "virerror.h"
36 #include "virfile.h"
37 #include "virkmod.h"
38 #include "virstring.h"
39 #include "viralloc.h"
40 #include "virpcivpd.h"
41 
42 VIR_LOG_INIT("util.pci");
43 
/* Root of the PCI bus tree in sysfs; virPCIDriverDir() and virPCIFile()
 * append "drivers/..." and "devices/..." to it. */
#define PCI_SYSFS "/sys/bus/pci/"
/* Size of the virPCIDevice 'id' buffer: 9 characters plus the NUL */
#define PCI_ID_LEN 10   /* "XXXX XXXX" */

/* String table registered for the virPCIELinkSpeed enum */
VIR_ENUM_IMPL(virPCIELinkSpeed,
              VIR_PCIE_LINK_SPEED_LAST,
              "", "2.5", "5", "8", "16",
);

/* String table registered for the virPCIStubDriver enum */
VIR_ENUM_IMPL(virPCIStubDriver,
              VIR_PCI_STUB_DRIVER_LAST,
              "none",
              "pciback", /* XEN */
              "vfio-pci", /* VFIO */
);

/* String table registered for the virPCIHeader enum */
VIR_ENUM_IMPL(virPCIHeader,
              VIR_PCI_HEADER_LAST,
              "endpoint",
              "pci-bridge",
              "cardbus-bridge",
);
65 
/* State tracked for a single host PCI device. */
struct _virPCIDevice {
    virPCIDeviceAddress address;

    char          *name;              /* domain:bus:slot.function */
    char          id[PCI_ID_LEN];     /* product vendor */
    /* file opened by virPCIDeviceConfigOpenInternal() to access the
     * device's config space */
    char          *path;

    /* The driver:domain which uses the device */
    char          *used_by_drvname;
    char          *used_by_domname;

    /* The following 5 items are only valid after virPCIDeviceInit()
     * has been called for the virPCIDevice object. This is *not* done
     * in most cases (because it creates extra overhead, and parts of
     * it can fail if libvirtd is running unprivileged)
     */
    unsigned int  pcie_cap_pos;       /* config space offset of the PCIe capability, 0 if none */
    unsigned int  pci_pm_cap_pos;     /* config space offset of the PM capability, 0 if none */
    bool          has_flr;
    bool          has_pm_reset;
    bool          is_pcie;
    /**/

    bool          managed;

    virPCIStubDriver stubDriver;

    /* used by reattach function */
    bool          unbind_from_stub;
    bool          remove_slot;
    bool          reprobe;
};
98 
/* Collection of virPCIDevice pointers. Its class is created from
 * virClassForObjectLockable() in virPCIOnceInit(), hence the
 * virObjectLockable base member.
 */
struct _virPCIDeviceList {
    virObjectLockable parent;

    size_t count;           /* presumably the number of entries in @devs */
    virPCIDevice **devs;
};
105 
106 
#define VIR_FROM_THIS VIR_FROM_NONE

/* Specifications referenced in comments:
 *  PCI30  - PCI Local Bus Specification 3.0
 *  PCIe20 - PCI Express Base Specification 2.0
 *  BR12   - PCI-to-PCI Bridge Architecture Specification 1.2
 *  PM12   - PCI Bus Power Management Interface Specification 1.2
 *  ECN_AF - Advanced Capabilities for Conventional PCI ECN
 */

/* PCI30 Section 6.1 Configuration Space Organization.
 * PCI_CONF_LEN is the amount of config space saved and restored around
 * a reset; PCI_CONF_HEADER_LEN is the length of the header, below which
 * no capability can be located.
 */
#define PCI_CONF_LEN            0x100
#define PCI_CONF_HEADER_LEN     0x40

/* PCI30 6.2.1 */
#define PCI_HEADER_TYPE         0x0e    /* Header type */
#define PCI_HEADER_TYPE_BRIDGE 0x1
#define PCI_HEADER_TYPE_MASK   0x7f
#define PCI_HEADER_TYPE_MULTI  0x80

/* PCI30 6.2.1  Device Identification */
#define PCI_CLASS_DEVICE        0x0a    /* Device class */

/* Class Code for bridge; PCI30 D.7  Base Class 06h */
#define PCI_CLASS_BRIDGE_PCI    0x0604

/* PCI30 6.2.3  Device Status */
#define PCI_STATUS              0x06    /* 16 bits */
#define PCI_STATUS_CAP_LIST    0x10    /* Support Capability List */

/* PCI30 6.7  Capabilities List */
#define PCI_CAPABILITY_LIST     0x34    /* Offset of first capability list entry */
#define PCI_CAP_FLAGS           2       /* Capability defined flags (16 bits) */

/* PM12 3.2.1  Capability Identifier */
#define PCI_CAP_ID_PM           0x01    /* Power Management */
/* PCI30 H Capability IDs */
#define PCI_CAP_ID_EXP          0x10    /* PCI Express */
/* ECN_AF 6.x.1.1  Capability ID for AF */
#define PCI_CAP_ID_AF           0x13    /* Advanced Features */

/* PCIe20 7.8.3  Device Capabilities Register (Offset 04h) */
#define PCI_EXP_DEVCAP          0x4     /* Device capabilities */
#define PCI_EXP_DEVCAP_FLR     (1<<28)  /* Function Level Reset */
#define PCI_EXP_LNKCAP          0xc     /* Link Capabilities */
#define PCI_EXP_LNKCAP_SPEED    0x0000f /* Maximum Link Speed */
#define PCI_EXP_LNKCAP_WIDTH    0x003f0 /* Maximum Link Width */
#define PCI_EXP_LNKSTA          0x12    /* Link Status */
#define PCI_EXP_LNKSTA_SPEED    0x000f  /* Negotiated Link Speed */
#define PCI_EXP_LNKSTA_WIDTH    0x03f0  /* Negotiated Link Width */

/* Header type 1 BR12 3.2 PCI-to-PCI Bridge Configuration Space Header Format */
#define PCI_PRIMARY_BUS         0x18    /* BR12 3.2.5.2 Primary bus number */
#define PCI_SECONDARY_BUS       0x19    /* BR12 3.2.5.3 Secondary bus number */
#define PCI_SUBORDINATE_BUS     0x1a    /* BR12 3.2.5.4 Highest bus number behind the bridge */
#define PCI_BRIDGE_CONTROL      0x3e
/* BR12 3.2.5.18  Bridge Control Register */
#define PCI_BRIDGE_CTL_RESET   0x40    /* Secondary bus reset */

/* PM12 3.2.4  Power Management Control/Status (Offset = 4) */
#define PCI_PM_CTRL                4    /* PM control and status register */
#define PCI_PM_CTRL_STATE_MASK    0x3  /* Current power state (D0 to D3) */
#define PCI_PM_CTRL_STATE_D0      0x0  /* D0 state */
#define PCI_PM_CTRL_STATE_D3hot   0x3  /* D3 state */
#define PCI_PM_CTRL_NO_SOFT_RESET 0x8  /* No reset for D3hot->D0 */

/* ECN_AF 6.x.1  Advanced Features Capability Structure */
#define PCI_AF_CAP              0x3     /* Advanced features capabilities */
#define PCI_AF_CAP_FLR         0x2     /* Function Level Reset */

/* NOTE(review): presumably the PCI Express Capabilities register offset
 * and its device/port type field - confirm against PCIe20 */
#define PCI_EXP_FLAGS           0x2
#define PCI_EXP_FLAGS_TYPE      0x00f0
#define PCI_EXP_TYPE_DOWNSTREAM 0x6

/* Extended capability list: the walk starts at PCI_EXT_CAP_BASE and
 * each 32-bit header carries the capability ID in its low 16 bits and
 * the offset of the next entry in its upper bits. */
#define PCI_EXT_CAP_BASE          0x100
#define PCI_EXT_CAP_LIMIT         0x1000
#define PCI_EXT_CAP_ID_MASK       0x0000ffff
#define PCI_EXT_CAP_OFFSET_SHIFT  20
#define PCI_EXT_CAP_OFFSET_MASK   0x00000ffc

/* ACS extended capability ID and the offset of its control register */
#define PCI_EXT_CAP_ID_ACS      0x000d
#define PCI_EXT_ACS_CTRL        0x06

/* ACS control bits; PCI_EXT_CAP_ACS_ENABLED is the OR of all four */
#define PCI_EXT_CAP_ACS_SV      0x01
#define PCI_EXT_CAP_ACS_RR      0x04
#define PCI_EXT_CAP_ACS_CR      0x08
#define PCI_EXT_CAP_ACS_UF      0x10
#define PCI_EXT_CAP_ACS_ENABLED (PCI_EXT_CAP_ACS_SV | \
                                 PCI_EXT_CAP_ACS_RR | \
                                 PCI_EXT_CAP_ACS_CR | \
                                 PCI_EXT_CAP_ACS_UF)

#define PCI_EXP_TYPE_ROOT_INT_EP 0x9    /* Root Complex Integrated Endpoint */
#define PCI_EXP_TYPE_ROOT_EC 0xa        /* Root Complex Event Collector */
201 
202 static virClass *virPCIDeviceListClass;
203 
204 static void virPCIDeviceListDispose(void *obj);
205 
virPCIOnceInit(void)206 static int virPCIOnceInit(void)
207 {
208     if (!VIR_CLASS_NEW(virPCIDeviceList, virClassForObjectLockable()))
209         return -1;
210 
211     return 0;
212 }
213 
214 VIR_ONCE_GLOBAL_INIT(virPCI);
215 
216 
217 static char *
virPCIDriverDir(const char * driver)218 virPCIDriverDir(const char *driver)
219 {
220     return g_strdup_printf(PCI_SYSFS "drivers/%s", driver);
221 }
222 
223 
224 static char *
virPCIFile(const char * device,const char * file)225 virPCIFile(const char *device, const char *file)
226 {
227     return g_strdup_printf(PCI_SYSFS "devices/%s/%s", device, file);
228 }
229 
230 
231 /* virPCIDeviceGetDriverPathAndName - put the path to the driver
232  * directory of the driver in use for this device in @path and the
233  * name of the driver in @name. Both could be NULL if it's not bound
234  * to any driver.
235  *
236  * Return 0 for success, -1 for error.
237  */
238 int
virPCIDeviceGetDriverPathAndName(virPCIDevice * dev,char ** path,char ** name)239 virPCIDeviceGetDriverPathAndName(virPCIDevice *dev, char **path, char **name)
240 {
241     int ret = -1;
242     g_autofree char *drvlink = NULL;
243 
244     *path = *name = NULL;
245 
246     /* drvlink = "/sys/bus/pci/dddd:bb:ss.ff/driver" */
247     drvlink = virPCIFile(dev->name, "driver");
248 
249     if (!virFileExists(drvlink)) {
250         ret = 0;
251         goto cleanup;
252     }
253 
254     if (virFileIsLink(drvlink) != 1) {
255         virReportError(VIR_ERR_INTERNAL_ERROR,
256                        _("Invalid device %s driver file %s is not a symlink"),
257                        dev->name, drvlink);
258         goto cleanup;
259     }
260     if (virFileResolveLink(drvlink, path) < 0) {
261         virReportError(VIR_ERR_INTERNAL_ERROR,
262                        _("Unable to resolve device %s driver symlink %s"),
263                        dev->name, drvlink);
264         goto cleanup;
265     }
266     /* path = "/sys/bus/pci/drivers/${drivername}" */
267 
268     *name = g_path_get_basename(*path);
269     /* name = "${drivername}" */
270 
271     ret = 0;
272  cleanup:
273     if (ret < 0) {
274         VIR_FREE(*path);
275         VIR_FREE(*name);
276     }
277     return ret;
278 }
279 
280 
281 static int
virPCIDeviceConfigOpenInternal(virPCIDevice * dev,bool readonly,bool fatal)282 virPCIDeviceConfigOpenInternal(virPCIDevice *dev, bool readonly, bool fatal)
283 {
284     int fd;
285 
286     fd = open(dev->path, readonly ? O_RDONLY : O_RDWR);
287 
288     if (fd < 0) {
289         if (fatal) {
290             virReportSystemError(errno,
291                                  _("Failed to open config space file '%s'"),
292                                  dev->path);
293         } else {
294             VIR_WARN("Failed to open config space file '%s': %s",
295                      dev->path, g_strerror(errno));
296         }
297         return -1;
298     }
299 
300     VIR_DEBUG("%s %s: opened %s", dev->id, dev->name, dev->path);
301     return fd;
302 }
303 
304 static int
virPCIDeviceConfigOpen(virPCIDevice * dev)305 virPCIDeviceConfigOpen(virPCIDevice *dev)
306 {
307     return virPCIDeviceConfigOpenInternal(dev, true, true);
308 }
309 
310 static int
virPCIDeviceConfigOpenTry(virPCIDevice * dev)311 virPCIDeviceConfigOpenTry(virPCIDevice *dev)
312 {
313     return virPCIDeviceConfigOpenInternal(dev, true, false);
314 }
315 
316 static int
virPCIDeviceConfigOpenWrite(virPCIDevice * dev)317 virPCIDeviceConfigOpenWrite(virPCIDevice *dev)
318 {
319     return virPCIDeviceConfigOpenInternal(dev, false, true);
320 }
321 
322 static void
virPCIDeviceConfigClose(virPCIDevice * dev,int cfgfd)323 virPCIDeviceConfigClose(virPCIDevice *dev, int cfgfd)
324 {
325     if (VIR_CLOSE(cfgfd) < 0) {
326         VIR_WARN("Failed to close config space file '%s': %s",
327                  dev->path, g_strerror(errno));
328     }
329 }
330 
331 
332 static int
virPCIDeviceRead(virPCIDevice * dev,int cfgfd,unsigned int pos,uint8_t * buf,unsigned int buflen)333 virPCIDeviceRead(virPCIDevice *dev,
334                  int cfgfd,
335                  unsigned int pos,
336                  uint8_t *buf,
337                  unsigned int buflen)
338 {
339     memset(buf, 0, buflen);
340     errno = 0;
341 
342     if (lseek(cfgfd, pos, SEEK_SET) != pos ||
343         saferead(cfgfd, buf, buflen) != buflen) {
344         VIR_DEBUG("Failed to read %u bytes at %u from '%s' : %s",
345                  buflen, pos, dev->path, g_strerror(errno));
346         return -1;
347     }
348     return 0;
349 }
350 
351 
352 /**
353  * virPCIDeviceReadN:
354  * @dev: virPCIDevice object (used only to log name of config file)
355  * @cfgfd: open file descriptor for device config file in sysfs
356  * @pos: byte offset in the file to read from
357  *
 * Read "N" bits (where "N" is "8", "16", or "32", and appears at the
 * end of the function name) from a PCI device's already-opened
 * sysfs config file and return them as the return value from the
 * function.
362  *
363  * Returns the value at @pos in the file, or 0 if there was an
364  * error. NB: since 0 could be a valid value, occurrence of an error
365  * must be determined by examining errno. errno is always reset to 0
366  * before the seek/read is attempted (see virPCIDeviceRead()), so if
367  * errno != 0 on return from one of these functions, then either the
368  * seek or the read operation failed for some reason. If errno == 0
369  * and the return value is 0, then the config file really does contain
370  * the value 0 at @pos.
371  */
372 static uint8_t
virPCIDeviceRead8(virPCIDevice * dev,int cfgfd,unsigned int pos)373 virPCIDeviceRead8(virPCIDevice *dev, int cfgfd, unsigned int pos)
374 {
375     uint8_t buf;
376     virPCIDeviceRead(dev, cfgfd, pos, &buf, sizeof(buf));
377     return buf;
378 }
379 
380 static uint16_t
virPCIDeviceRead16(virPCIDevice * dev,int cfgfd,unsigned int pos)381 virPCIDeviceRead16(virPCIDevice *dev, int cfgfd, unsigned int pos)
382 {
383     uint8_t buf[2];
384     virPCIDeviceRead(dev, cfgfd, pos, &buf[0], sizeof(buf));
385     return (buf[0] << 0) | (buf[1] << 8);
386 }
387 
388 static uint32_t
virPCIDeviceRead32(virPCIDevice * dev,int cfgfd,unsigned int pos)389 virPCIDeviceRead32(virPCIDevice *dev, int cfgfd, unsigned int pos)
390 {
391     uint8_t buf[4];
392     virPCIDeviceRead(dev, cfgfd, pos, &buf[0], sizeof(buf));
393     return (buf[0] << 0) | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24);
394 }
395 
396 static int
virPCIDeviceReadClass(virPCIDevice * dev,uint16_t * device_class)397 virPCIDeviceReadClass(virPCIDevice *dev, uint16_t *device_class)
398 {
399     g_autofree char *path = NULL;
400     g_autofree char *id_str = NULL;
401     unsigned int value;
402 
403     path = virPCIFile(dev->name, "class");
404 
405     /* class string is '0xNNNNNN\n' ... i.e. 9 bytes */
406     if (virFileReadAll(path, 9, &id_str) < 0)
407         return -1;
408 
409     id_str[8] = '\0';
410     if (virStrToLong_ui(id_str, NULL, 16, &value) < 0) {
411         virReportError(VIR_ERR_INTERNAL_ERROR,
412                        _("Unusual value in %s/devices/%s/class: %s"),
413                        PCI_SYSFS, dev->name, id_str);
414         return -1;
415     }
416 
417     *device_class = (value >> 8) & 0xFFFF;
418     return 0;
419 }
420 
421 static int
virPCIDeviceWrite(virPCIDevice * dev,int cfgfd,unsigned int pos,uint8_t * buf,unsigned int buflen)422 virPCIDeviceWrite(virPCIDevice *dev,
423                   int cfgfd,
424                   unsigned int pos,
425                   uint8_t *buf,
426                   unsigned int buflen)
427 {
428     if (lseek(cfgfd, pos, SEEK_SET) != pos ||
429         safewrite(cfgfd, buf, buflen) != buflen) {
430         VIR_WARN("Failed to write to '%s' : %s", dev->path,
431                  g_strerror(errno));
432         return -1;
433     }
434     return 0;
435 }
436 
437 static void
virPCIDeviceWrite16(virPCIDevice * dev,int cfgfd,unsigned int pos,uint16_t val)438 virPCIDeviceWrite16(virPCIDevice *dev, int cfgfd, unsigned int pos, uint16_t val)
439 {
440     uint8_t buf[2] = { (val >> 0), (val >> 8) };
441     virPCIDeviceWrite(dev, cfgfd, pos, &buf[0], sizeof(buf));
442 }
443 
444 static void
virPCIDeviceWrite32(virPCIDevice * dev,int cfgfd,unsigned int pos,uint32_t val)445 virPCIDeviceWrite32(virPCIDevice *dev, int cfgfd, unsigned int pos, uint32_t val)
446 {
447     uint8_t buf[4] = { (val >> 0), (val >> 8), (val >> 16), (val >> 24) };
448     virPCIDeviceWrite(dev, cfgfd, pos, &buf[0], sizeof(buf));
449 }
450 
451 typedef int (*virPCIDeviceIterPredicate)(virPCIDevice *, virPCIDevice *,
452                                          void *);
453 
454 /* Iterate over available PCI devices calling @predicate
455  * to compare each one to @dev.
456  * Return -1 on error since we don't want to assume it is
457  * safe to reset if there is an error.
458  */
459 static int
virPCIDeviceIterDevices(virPCIDeviceIterPredicate predicate,virPCIDevice * dev,virPCIDevice ** matched,void * data)460 virPCIDeviceIterDevices(virPCIDeviceIterPredicate predicate,
461                         virPCIDevice *dev,
462                         virPCIDevice **matched,
463                         void *data)
464 {
465     g_autoptr(DIR) dir = NULL;
466     struct dirent *entry;
467     int ret = 0;
468     int rc;
469 
470     *matched = NULL;
471 
472     VIR_DEBUG("%s %s: iterating over " PCI_SYSFS "devices", dev->id, dev->name);
473 
474     if (virDirOpen(&dir, PCI_SYSFS "devices") < 0)
475         return -1;
476 
477     while ((ret = virDirRead(dir, &entry, PCI_SYSFS "devices")) > 0) {
478         g_autoptr(virPCIDevice) check = NULL;
479         virPCIDeviceAddress devAddr;
480         char *tmp;
481 
482         /* expected format: <domain>:<bus>:<slot>.<function> */
483         if (/* domain */
484             virStrToLong_ui(entry->d_name, &tmp, 16, &devAddr.domain) < 0 || *tmp != ':' ||
485             /* bus */
486             virStrToLong_ui(tmp + 1, &tmp, 16, &devAddr.bus) < 0 || *tmp != ':' ||
487             /* slot */
488             virStrToLong_ui(tmp + 1, &tmp, 16, &devAddr.slot) < 0 || *tmp != '.' ||
489             /* function */
490             virStrToLong_ui(tmp + 1, NULL, 16, &devAddr.function) < 0) {
491             VIR_WARN("Unusual entry in " PCI_SYSFS "devices: %s", entry->d_name);
492             continue;
493         }
494 
495         check = virPCIDeviceNew(&devAddr);
496         if (!check) {
497             ret = -1;
498             break;
499         }
500 
501         rc = predicate(dev, check, data);
502         if (rc < 0) {
503             /* the predicate returned an error, bail */
504             ret = -1;
505             break;
506         } else if (rc == 1) {
507             VIR_DEBUG("%s %s: iter matched on %s", dev->id, dev->name, check->name);
508             *matched = g_steal_pointer(&check);
509             ret = 1;
510             break;
511         }
512     }
513     return ret;
514 }
515 
516 
517 /**
518  * virPCIDeviceFindCapabilityOffset:
519  * @dev: virPCIDevice object (used only to log name of config file)
520  * @cfgfd: open file descriptor for device config file in sysfs
521  * @capability: PCI_CAP_ID_* being requested
522  * @offset: used to return the offset of @capability in the file
523  *
524  * Find the offset of @capability within the PCI config file @cfgfd of
525  * the device @dev. if found, the offset is returned in @offset,
526  * otherwise @offset is set to 0.
527  *
528  * Returns 0 on success, -1 on failure.
529  */
530 static int
virPCIDeviceFindCapabilityOffset(virPCIDevice * dev,int cfgfd,unsigned int capability,unsigned int * offset)531 virPCIDeviceFindCapabilityOffset(virPCIDevice *dev,
532                                  int cfgfd,
533                                  unsigned int capability,
534                                  unsigned int *offset)
535 {
536     uint16_t status;
537     uint8_t pos;
538 
539     *offset = 0; /* assume failure (*nothing* can be at offset 0) */
540 
541     status = virPCIDeviceRead16(dev, cfgfd, PCI_STATUS);
542     if (errno != 0 || !(status & PCI_STATUS_CAP_LIST))
543         goto error;
544 
545     pos = virPCIDeviceRead8(dev, cfgfd, PCI_CAPABILITY_LIST);
546     if (errno != 0)
547         goto error;
548 
549     /* Zero indicates last capability, capabilities can't
550      * be in the config space header and 0xff is returned
551      * by the kernel if we don't have access to this region
552      *
553      * Note: we're not handling loops or extended
554      * capabilities here.
555      */
556     while (pos >= PCI_CONF_HEADER_LEN && pos != 0xff) {
557         uint8_t capid = virPCIDeviceRead8(dev, cfgfd, pos);
558         if (errno != 0)
559             goto error;
560 
561         if (capid == capability) {
562             VIR_DEBUG("%s %s: found cap 0x%.2x at 0x%.2x",
563                       dev->id, dev->name, capability, pos);
564             *offset = pos;
565             return 0;
566         }
567 
568         pos = virPCIDeviceRead8(dev, cfgfd, pos + 1);
569         if (errno != 0)
570             goto error;
571     }
572 
573  error:
574     VIR_DEBUG("%s %s: failed to find cap 0x%.2x (%s)",
575               dev->id, dev->name, capability, g_strerror(errno));
576 
577     /* reset errno in case the failure was due to insufficient
578      * privileges to read the entire PCI config file
579      */
580     errno = 0;
581 
582     return -1;
583 }
584 
585 static unsigned int
virPCIDeviceFindExtendedCapabilityOffset(virPCIDevice * dev,int cfgfd,unsigned int capability)586 virPCIDeviceFindExtendedCapabilityOffset(virPCIDevice *dev,
587                                          int cfgfd,
588                                          unsigned int capability)
589 {
590     int ttl;
591     unsigned int pos;
592     uint32_t header;
593 
594     /* minimum 8 bytes per capability */
595     ttl = (PCI_EXT_CAP_LIMIT - PCI_EXT_CAP_BASE) / 8;
596     pos = PCI_EXT_CAP_BASE;
597 
598     while (ttl > 0 && pos >= PCI_EXT_CAP_BASE) {
599         header = virPCIDeviceRead32(dev, cfgfd, pos);
600 
601         if ((header & PCI_EXT_CAP_ID_MASK) == capability)
602             return pos;
603 
604         pos = (header >> PCI_EXT_CAP_OFFSET_SHIFT) & PCI_EXT_CAP_OFFSET_MASK;
605         ttl--;
606     }
607 
608     return 0;
609 }
610 
611 /* detects whether this device has FLR.  Returns 0 if the device does
612  * not have FLR, 1 if it does, and -1 on error
613  */
614 static bool
virPCIDeviceDetectFunctionLevelReset(virPCIDevice * dev,int cfgfd)615 virPCIDeviceDetectFunctionLevelReset(virPCIDevice *dev, int cfgfd)
616 {
617     uint32_t caps;
618     unsigned int pos;
619     g_autofree char *path = NULL;
620     int found;
621 
622     /* The PCIe Function Level Reset capability allows
623      * individual device functions to be reset without
624      * affecting any other functions on the device or
625      * any other devices on the bus. This is only common
626      * on SR-IOV NICs at the moment.
627      */
628     if (dev->pcie_cap_pos) {
629         caps = virPCIDeviceRead32(dev, cfgfd, dev->pcie_cap_pos + PCI_EXP_DEVCAP);
630         if (caps & PCI_EXP_DEVCAP_FLR) {
631             VIR_DEBUG("%s %s: detected PCIe FLR capability", dev->id, dev->name);
632             return true;
633         }
634     }
635 
636     /* The PCI AF Function Level Reset capability is
637      * the same thing, except for conventional PCI
638      * devices. This is not common yet.
639      */
640     if (virPCIDeviceFindCapabilityOffset(dev, cfgfd, PCI_CAP_ID_AF, &pos) < 0)
641         goto error;
642 
643     if (pos) {
644         caps = virPCIDeviceRead16(dev, cfgfd, pos + PCI_AF_CAP);
645         if (caps & PCI_AF_CAP_FLR) {
646             VIR_DEBUG("%s %s: detected PCI FLR capability", dev->id, dev->name);
647             return true;
648         }
649     }
650 
651     /* there are some buggy devices that do support FLR, but forget to
652      * advertise that fact in their capabilities.  However, FLR is *required*
653      * to be present for virtual functions (VFs), so if we see that this
654      * device is a VF, we just assume FLR works
655      */
656 
657     path = g_strdup_printf(PCI_SYSFS "devices/%s/physfn", dev->name);
658 
659     found = virFileExists(path);
660     if (found) {
661         VIR_DEBUG("%s %s: buggy device didn't advertise FLR, but is a VF; forcing flr on",
662                   dev->id, dev->name);
663         return true;
664     }
665 
666  error:
667     VIR_DEBUG("%s %s: no FLR capability found", dev->id, dev->name);
668     return false;
669 }
670 
671 /* Require the device has the PCI Power Management capability
672  * and that a D3hot->D0 transition will results in a full
673  * internal reset, not just a soft reset.
674  */
675 static bool
virPCIDeviceDetectPowerManagementReset(virPCIDevice * dev,int cfgfd)676 virPCIDeviceDetectPowerManagementReset(virPCIDevice *dev, int cfgfd)
677 {
678     if (dev->pci_pm_cap_pos) {
679         uint32_t ctl;
680 
681         /* require the NO_SOFT_RESET bit is clear */
682         ctl = virPCIDeviceRead32(dev, cfgfd, dev->pci_pm_cap_pos + PCI_PM_CTRL);
683         if (!(ctl & PCI_PM_CTRL_NO_SOFT_RESET)) {
684             VIR_DEBUG("%s %s: detected PM reset capability", dev->id, dev->name);
685             return true;
686         }
687     }
688 
689     VIR_DEBUG("%s %s: no PM reset capability found", dev->id, dev->name);
690 
691     return false;
692 }
693 
694 /* Any active devices on the same domain/bus ? */
695 static int
virPCIDeviceSharesBusWithActive(virPCIDevice * dev,virPCIDevice * check,void * data)696 virPCIDeviceSharesBusWithActive(virPCIDevice *dev, virPCIDevice *check, void *data)
697 {
698     virPCIDeviceList *inactiveDevs = data;
699 
700     /* Different domain, different bus, or simply identical device */
701     if (dev->address.domain != check->address.domain ||
702         dev->address.bus != check->address.bus ||
703         (dev->address.slot == check->address.slot &&
704          dev->address.function == check->address.function))
705         return 0;
706 
707     /* same bus, but inactive, i.e. about to be assigned to guest */
708     if (inactiveDevs && virPCIDeviceListFind(inactiveDevs, &check->address))
709         return 0;
710 
711     return 1;
712 }
713 
714 static virPCIDevice *
virPCIDeviceBusContainsActiveDevices(virPCIDevice * dev,virPCIDeviceList * inactiveDevs)715 virPCIDeviceBusContainsActiveDevices(virPCIDevice *dev,
716                                      virPCIDeviceList *inactiveDevs)
717 {
718     virPCIDevice *active = NULL;
719     if (virPCIDeviceIterDevices(virPCIDeviceSharesBusWithActive,
720                                 dev, &active, inactiveDevs) < 0)
721         return NULL;
722     return active;
723 }
724 
725 /* Is @check the parent of @dev ? */
726 static int
virPCIDeviceIsParent(virPCIDevice * dev,virPCIDevice * check,void * data)727 virPCIDeviceIsParent(virPCIDevice *dev, virPCIDevice *check, void *data)
728 {
729     uint16_t device_class;
730     uint8_t header_type, secondary, subordinate;
731     virPCIDevice **best = data;
732     int ret = 0;
733     int fd;
734 
735     if (dev->address.domain != check->address.domain)
736         return 0;
737 
738     if ((fd = virPCIDeviceConfigOpenTry(check)) < 0)
739         return 0;
740 
741     /* Is it a bridge? */
742     ret = virPCIDeviceReadClass(check, &device_class);
743     if (ret < 0 || device_class != PCI_CLASS_BRIDGE_PCI)
744         goto cleanup;
745 
746     /* Is it a plane? */
747     header_type = virPCIDeviceRead8(check, fd, PCI_HEADER_TYPE);
748     if ((header_type & PCI_HEADER_TYPE_MASK) != PCI_HEADER_TYPE_BRIDGE)
749         goto cleanup;
750 
751     secondary   = virPCIDeviceRead8(check, fd, PCI_SECONDARY_BUS);
752     subordinate = virPCIDeviceRead8(check, fd, PCI_SUBORDINATE_BUS);
753 
754     VIR_DEBUG("%s %s: found parent device %s", dev->id, dev->name, check->name);
755 
756     /* if the secondary bus exactly equals the device's bus, then we found
757      * the direct parent.  No further work is necessary
758      */
759     if (dev->address.bus == secondary) {
760         ret = 1;
761         goto cleanup;
762     }
763 
764     /* otherwise, SRIOV allows VFs to be on different buses than their PFs.
765      * In this case, what we need to do is look for the "best" match; i.e.
766      * the most restrictive match that still satisfies all of the conditions.
767      */
768     if (dev->address.bus > secondary && dev->address.bus <= subordinate) {
769         if (*best == NULL) {
770             *best = virPCIDeviceNew(&check->address);
771             if (*best == NULL) {
772                 ret = -1;
773                 goto cleanup;
774             }
775         } else {
776             /* OK, we had already recorded a previous "best" match for the
777              * parent.  See if the current device is more restrictive than the
778              * best, and if so, make it the new best
779              */
780             int bestfd;
781             uint8_t best_secondary;
782 
783             if ((bestfd = virPCIDeviceConfigOpenTry(*best)) < 0)
784                 goto cleanup;
785             best_secondary = virPCIDeviceRead8(*best, bestfd, PCI_SECONDARY_BUS);
786             virPCIDeviceConfigClose(*best, bestfd);
787 
788             if (secondary > best_secondary) {
789                 virPCIDeviceFree(*best);
790                 *best = virPCIDeviceNew(&check->address);
791                 if (*best == NULL) {
792                     ret = -1;
793                     goto cleanup;
794                 }
795             }
796         }
797     }
798 
799  cleanup:
800     virPCIDeviceConfigClose(check, fd);
801     return ret;
802 }
803 
804 static int
virPCIDeviceGetParent(virPCIDevice * dev,virPCIDevice ** parent)805 virPCIDeviceGetParent(virPCIDevice *dev, virPCIDevice **parent)
806 {
807     virPCIDevice *best = NULL;
808     int ret;
809 
810     *parent = NULL;
811     ret = virPCIDeviceIterDevices(virPCIDeviceIsParent, dev, parent, &best);
812     if (ret == 1)
813         virPCIDeviceFree(best);
814     else if (ret == 0)
815         *parent = best;
816     return ret;
817 }
818 
819 /* Secondary Bus Reset is our sledgehammer - it resets all
820  * devices behind a bus.
821  */
822 static int
virPCIDeviceTrySecondaryBusReset(virPCIDevice * dev,int cfgfd,virPCIDeviceList * inactiveDevs)823 virPCIDeviceTrySecondaryBusReset(virPCIDevice *dev,
824                                  int cfgfd,
825                                  virPCIDeviceList *inactiveDevs)
826 {
827     g_autoptr(virPCIDevice) parent = NULL;
828     g_autoptr(virPCIDevice) conflict = NULL;
829     uint8_t config_space[PCI_CONF_LEN];
830     uint16_t ctl;
831     int ret = -1;
832     int parentfd;
833 
834     /* Refuse to do a secondary bus reset if there are other
835      * devices/functions behind the bus are used by the host
836      * or other guests.
837      */
838     if ((conflict = virPCIDeviceBusContainsActiveDevices(dev, inactiveDevs))) {
839         virReportError(VIR_ERR_INTERNAL_ERROR,
840                        _("Active %s devices on bus with %s, not doing bus reset"),
841                        conflict->name, dev->name);
842         return -1;
843     }
844 
845     /* Find the parent bus */
846     if (virPCIDeviceGetParent(dev, &parent) < 0)
847         return -1;
848     if (!parent) {
849         virReportError(VIR_ERR_INTERNAL_ERROR,
850                        _("Failed to find parent device for %s"),
851                        dev->name);
852         return -1;
853     }
854     if ((parentfd = virPCIDeviceConfigOpenWrite(parent)) < 0)
855         goto out;
856 
857     VIR_DEBUG("%s %s: doing a secondary bus reset", dev->id, dev->name);
858 
859     /* Save and restore the device's config space; we only do this
860      * for the supplied device since we refuse to do a reset if there
861      * are multiple devices/functions
862      */
863     if (virPCIDeviceRead(dev, cfgfd, 0, config_space, PCI_CONF_LEN) < 0) {
864         virReportError(VIR_ERR_INTERNAL_ERROR,
865                        _("Failed to read PCI config space for %s"),
866                        dev->name);
867         goto out;
868     }
869 
870     /* Read the control register, set the reset flag, wait 200ms,
871      * unset the reset flag and wait 200ms.
872      */
873     ctl = virPCIDeviceRead16(dev, parentfd, PCI_BRIDGE_CONTROL);
874 
875     virPCIDeviceWrite16(parent, parentfd, PCI_BRIDGE_CONTROL,
876                         ctl | PCI_BRIDGE_CTL_RESET);
877 
878     g_usleep(200 * 1000); /* sleep 200ms */
879 
880     virPCIDeviceWrite16(parent, parentfd, PCI_BRIDGE_CONTROL, ctl);
881 
882     g_usleep(200 * 1000); /* sleep 200ms */
883 
884     if (virPCIDeviceWrite(dev, cfgfd, 0, config_space, PCI_CONF_LEN) < 0) {
885         virReportError(VIR_ERR_INTERNAL_ERROR,
886                        _("Failed to restore PCI config space for %s"),
887                        dev->name);
888         goto out;
889     }
890     ret = 0;
891 
892  out:
893     virPCIDeviceConfigClose(parent, parentfd);
894     return ret;
895 }
896 
897 /* Power management reset attempts to reset a device using a
898  * D-state transition from D3hot to D0. Note, in detect_pm_reset()
899  * above we require the device supports a full internal reset.
900  */
901 static int
virPCIDeviceTryPowerManagementReset(virPCIDevice * dev,int cfgfd)902 virPCIDeviceTryPowerManagementReset(virPCIDevice *dev, int cfgfd)
903 {
904     uint8_t config_space[PCI_CONF_LEN];
905     uint32_t ctl;
906 
907     if (!dev->pci_pm_cap_pos)
908         return -1;
909 
910     /* Save and restore the device's config space. */
911     if (virPCIDeviceRead(dev, cfgfd, 0, &config_space[0], PCI_CONF_LEN) < 0) {
912         virReportError(VIR_ERR_INTERNAL_ERROR,
913                        _("Failed to read PCI config space for %s"),
914                        dev->name);
915         return -1;
916     }
917 
918     VIR_DEBUG("%s %s: doing a power management reset", dev->id, dev->name);
919 
920     ctl = virPCIDeviceRead32(dev, cfgfd, dev->pci_pm_cap_pos + PCI_PM_CTRL);
921     ctl &= ~PCI_PM_CTRL_STATE_MASK;
922 
923     virPCIDeviceWrite32(dev, cfgfd, dev->pci_pm_cap_pos + PCI_PM_CTRL,
924                         ctl | PCI_PM_CTRL_STATE_D3hot);
925 
926     g_usleep(10 * 1000); /* sleep 10ms */
927 
928     virPCIDeviceWrite32(dev, cfgfd, dev->pci_pm_cap_pos + PCI_PM_CTRL,
929                         ctl | PCI_PM_CTRL_STATE_D0);
930 
931     g_usleep(10 * 1000); /* sleep 10ms */
932 
933     if (virPCIDeviceWrite(dev, cfgfd, 0, &config_space[0], PCI_CONF_LEN) < 0) {
934         virReportError(VIR_ERR_INTERNAL_ERROR,
935                        _("Failed to restore PCI config space for %s"),
936                        dev->name);
937         return -1;
938     }
939 
940     return 0;
941 }
942 
943 /**
944  * virPCIDeviceInit:
945  * @dev: virPCIDevice object needing its PCI capabilities info initialized
946  * @cfgfd: open file descriptor for device config file in sysfs
947  *
948  * Initialize the PCI capabilities attributes of a virPCIDevice object
949  * (i.e. pcie_cap_pos, pci_pm_cap_pos, has_flr, has_pm_reset, and
950  * is_pcie). This is done by walking the info in the (already-opened)
951  * device PCI config file in sysfs. This function can be called
952  * regardless of whether a process has sufficient privilege to read
953  * the entire file (unprivileged processes can only read the 1st 64
954  * bytes, while the Express Capabilities are all located beyond that
955  * boundary).
956  *
957  * In the case that we are unable to read a capability
958  * directly, we will attempt to infer its value by other means. In
959  * particular, we can determine that a device is (almost surely) PCIe
960  * by checking that the length of the config file is != 256 (since all
961  * conventional PCI config files are 256 bytes), and we know that any
962  * device that is an SR-IOV VF will have FLR available (since that is
963  * required by the SR-IOV spec.)
964  *
965  * Always returns success (0) (for now)
966  */
967 static int
virPCIDeviceInit(virPCIDevice * dev,int cfgfd)968 virPCIDeviceInit(virPCIDevice *dev, int cfgfd)
969 {
970     dev->is_pcie = false;
971     if (virPCIDeviceFindCapabilityOffset(dev, cfgfd, PCI_CAP_ID_EXP, &dev->pcie_cap_pos) < 0) {
972         /* an unprivileged process is unable to read *all* of a
973          * device's PCI config (it can only read the first 64
974          * bytes, which isn't enough for see the Express
975          * Capabilities data). If virPCIDeviceFindCapabilityOffset
976          * returns failure (and not just a pcie_cap_pos == 0,
977          * which is *success* at determining the device is *not*
978          * PCIe) we make an educated guess based on the length of
979          * the device's config file - if it is 256 bytes, then it
980          * is definitely a legacy PCI device. If it's larger than
981          * that, then it is *probably PCIe (although it could be
982          * PCI-x, but those are extremely rare). If the config
983          * file can't be found (in which case the "length" will be
984          * -1), then we blindly assume the most likely outcome -
985          * PCIe.
986          */
987         off_t configLen = virFileLength(virPCIDeviceGetConfigPath(dev), -1);
988 
989         if (configLen != 256)
990             dev->is_pcie = true;
991 
992     } else {
993         dev->is_pcie = (dev->pcie_cap_pos != 0);
994     }
995 
996     virPCIDeviceFindCapabilityOffset(dev, cfgfd, PCI_CAP_ID_PM, &dev->pci_pm_cap_pos);
997     dev->has_flr = virPCIDeviceDetectFunctionLevelReset(dev, cfgfd);
998     dev->has_pm_reset = virPCIDeviceDetectPowerManagementReset(dev, cfgfd);
999 
1000     return 0;
1001 }
1002 
1003 int
virPCIDeviceReset(virPCIDevice * dev,virPCIDeviceList * activeDevs,virPCIDeviceList * inactiveDevs)1004 virPCIDeviceReset(virPCIDevice *dev,
1005                   virPCIDeviceList *activeDevs,
1006                   virPCIDeviceList *inactiveDevs)
1007 {
1008     g_autofree char *drvPath = NULL;
1009     g_autofree char *drvName = NULL;
1010     int ret = -1;
1011     int fd = -1;
1012     int hdrType = -1;
1013 
1014     if (virPCIGetHeaderType(dev, &hdrType) < 0)
1015         return -1;
1016 
1017     if (hdrType != VIR_PCI_HEADER_ENDPOINT) {
1018         virReportError(VIR_ERR_INTERNAL_ERROR,
1019                        _("Invalid attempt to reset PCI device %s. "
1020                          "Only PCI endpoint devices can be reset"),
1021                        dev->name);
1022         return -1;
1023     }
1024 
1025     if (activeDevs && virPCIDeviceListFind(activeDevs, &dev->address)) {
1026         virReportError(VIR_ERR_INTERNAL_ERROR,
1027                        _("Not resetting active device %s"), dev->name);
1028         return -1;
1029     }
1030 
1031     /* If the device is currently bound to vfio-pci, ignore all
1032      * requests to reset it, since the vfio-pci driver will always
1033      * reset it whenever appropriate, so doing it ourselves would just
1034      * be redundant.
1035      */
1036     if (virPCIDeviceGetDriverPathAndName(dev, &drvPath, &drvName) < 0)
1037         goto cleanup;
1038 
1039     if (virPCIStubDriverTypeFromString(drvName) == VIR_PCI_STUB_DRIVER_VFIO) {
1040         VIR_DEBUG("Device %s is bound to vfio-pci - skip reset",
1041                   dev->name);
1042         ret = 0;
1043         goto cleanup;
1044     }
1045     VIR_DEBUG("Resetting device %s", dev->name);
1046 
1047     if ((fd = virPCIDeviceConfigOpenWrite(dev)) < 0)
1048         goto cleanup;
1049 
1050     if (virPCIDeviceInit(dev, fd) < 0)
1051         goto cleanup;
1052 
1053     /* KVM will perform FLR when starting and stopping
1054      * a guest, so there is no need for us to do it here.
1055      */
1056     if (dev->has_flr) {
1057         ret = 0;
1058         goto cleanup;
1059     }
1060 
1061     /* If the device supports PCI power management reset,
1062      * that's the next best thing because it only resets
1063      * the function, not the whole device.
1064      */
1065     if (dev->has_pm_reset)
1066         ret = virPCIDeviceTryPowerManagementReset(dev, fd);
1067 
1068     /* Bus reset is not an option with the root bus */
1069     if (ret < 0 && dev->address.bus != 0)
1070         ret = virPCIDeviceTrySecondaryBusReset(dev, fd, inactiveDevs);
1071 
1072     if (ret < 0) {
1073         virErrorPtr err = virGetLastError();
1074         virReportError(VIR_ERR_INTERNAL_ERROR,
1075                        _("Unable to reset PCI device %s: %s"),
1076                        dev->name,
1077                        err ? err->message :
1078                        _("no FLR, PM reset or bus reset available"));
1079     }
1080 
1081  cleanup:
1082     virPCIDeviceConfigClose(dev, fd);
1083     return ret;
1084 }
1085 
1086 
1087 static int
virPCIProbeStubDriver(virPCIStubDriver driver)1088 virPCIProbeStubDriver(virPCIStubDriver driver)
1089 {
1090     const char *drvname = NULL;
1091     g_autofree char *drvpath = NULL;
1092     g_autofree char *errbuf = NULL;
1093 
1094     if (driver == VIR_PCI_STUB_DRIVER_NONE ||
1095         !(drvname = virPCIStubDriverTypeToString(driver))) {
1096         virReportError(VIR_ERR_INTERNAL_ERROR,
1097                        "%s",
1098                        _("Attempting to use unknown stub driver"));
1099         return -1;
1100     }
1101 
1102     drvpath = virPCIDriverDir(drvname);
1103 
1104     /* driver previously loaded, return */
1105     if (virFileExists(drvpath))
1106         return 0;
1107 
1108     if ((errbuf = virKModLoad(drvname))) {
1109         VIR_WARN("failed to load driver %s: %s", drvname, errbuf);
1110         goto cleanup;
1111     }
1112 
1113     /* driver loaded after probing */
1114     if (virFileExists(drvpath))
1115         return 0;
1116 
1117  cleanup:
1118     /* If we know failure was because of admin config, let's report that;
1119      * otherwise, report a more generic failure message
1120      */
1121     if (virKModIsProhibited(drvname)) {
1122         virReportError(VIR_ERR_INTERNAL_ERROR,
1123                        _("Failed to load PCI stub module %s: "
1124                          "administratively prohibited"),
1125                        drvname);
1126     } else {
1127         virReportError(VIR_ERR_INTERNAL_ERROR,
1128                        _("Failed to load PCI stub module %s"),
1129                        drvname);
1130     }
1131 
1132     return -1;
1133 }
1134 
1135 int
virPCIDeviceUnbind(virPCIDevice * dev)1136 virPCIDeviceUnbind(virPCIDevice *dev)
1137 {
1138     g_autofree char *path = NULL;
1139     g_autofree char *drvpath = NULL;
1140     g_autofree char *driver = NULL;
1141 
1142     if (virPCIDeviceGetDriverPathAndName(dev, &drvpath, &driver) < 0)
1143         return -1;
1144 
1145     if (!driver)
1146         /* The device is not bound to any driver */
1147         return 0;
1148 
1149     path = virPCIFile(dev->name, "driver/unbind");
1150 
1151     if (virFileExists(path)) {
1152         if (virFileWriteStr(path, dev->name, 0) < 0) {
1153             virReportSystemError(errno,
1154                                  _("Failed to unbind PCI device '%s' from %s"),
1155                                  dev->name, driver);
1156             return -1;
1157         }
1158     }
1159 
1160     return 0;
1161 }
1162 
1163 
1164 /**
1165  * virPCIDeviceRebind:
1166  *  @dev: virPCIDevice object describing the device to rebind
1167  *
1168  * unbind a device from its driver, then immediately rebind it.
1169  *
1170  * Returns 0 on success, -1 on failure
1171  */
virPCIDeviceRebind(virPCIDevice * dev)1172 int virPCIDeviceRebind(virPCIDevice *dev)
1173 {
1174     if (virPCIDeviceUnbind(dev) < 0)
1175         return -1;
1176 
1177     if (virFileWriteStr(PCI_SYSFS "drivers_probe", dev->name, 0) < 0) {
1178         virReportSystemError(errno,
1179                              _("Failed to trigger a probe for PCI device '%s'"),
1180                              dev->name);
1181         return -1;
1182     }
1183 
1184     return 0;
1185 }
1186 
1187 
1188 /*
1189  * Bind a PCI device to a driver using driver_override sysfs interface.
1190  * E.g.
1191  *
1192  *  echo driver-name > /sys/bus/pci/devices/0000:03:00.0/driver_override
1193  *  echo 0000:03:00.0 > /sys/bus/pci/devices/0000:03:00.0/driver/unbind
1194  *  echo 0000:03:00.0 > /sys/bus/pci/drivers_probe
1195  *
1196  * An empty driverName will cause the device to be bound to its
1197  * preferred driver.
1198  */
1199 static int
virPCIDeviceBindWithDriverOverride(virPCIDevice * dev,const char * driverName)1200 virPCIDeviceBindWithDriverOverride(virPCIDevice *dev,
1201                                    const char *driverName)
1202 {
1203     g_autofree char *path = NULL;
1204 
1205     path = virPCIFile(dev->name, "driver_override");
1206 
1207     if (virFileWriteStr(path, driverName, 0) < 0) {
1208         virReportSystemError(errno,
1209                              _("Failed to add driver '%s' to driver_override "
1210                                " interface of PCI device '%s'"),
1211                              driverName, dev->name);
1212         return -1;
1213     }
1214 
1215     if (virPCIDeviceRebind(dev) < 0)
1216         return -1;
1217 
1218     return 0;
1219 }
1220 
1221 static int
virPCIDeviceUnbindFromStub(virPCIDevice * dev)1222 virPCIDeviceUnbindFromStub(virPCIDevice *dev)
1223 {
1224     if (!dev->unbind_from_stub) {
1225         VIR_DEBUG("Unbind from stub skipped for PCI device %s", dev->name);
1226         return 0;
1227     }
1228 
1229     return virPCIDeviceBindWithDriverOverride(dev, "\n");
1230 }
1231 
1232 static int
virPCIDeviceBindToStub(virPCIDevice * dev)1233 virPCIDeviceBindToStub(virPCIDevice *dev)
1234 {
1235     const char *stubDriverName;
1236     g_autofree char *stubDriverPath = NULL;
1237     g_autofree char *driverLink = NULL;
1238 
1239     /* Check the device is configured to use one of the known stub drivers */
1240     if (dev->stubDriver == VIR_PCI_STUB_DRIVER_NONE) {
1241         virReportError(VIR_ERR_INTERNAL_ERROR,
1242                        _("No stub driver configured for PCI device %s"),
1243                        dev->name);
1244         return -1;
1245     } else if (!(stubDriverName = virPCIStubDriverTypeToString(dev->stubDriver))) {
1246         virReportError(VIR_ERR_INTERNAL_ERROR,
1247                        _("Unknown stub driver configured for PCI device %s"),
1248                        dev->name);
1249         return -1;
1250     }
1251 
1252     stubDriverPath = virPCIDriverDir(stubDriverName);
1253     driverLink = virPCIFile(dev->name, "driver");
1254 
1255     if (virFileExists(driverLink)) {
1256         if (virFileLinkPointsTo(driverLink, stubDriverPath)) {
1257             /* The device is already bound to the correct driver */
1258             VIR_DEBUG("Device %s is already bound to %s",
1259                       dev->name, stubDriverName);
1260             return 0;
1261         }
1262     }
1263 
1264     if (virPCIDeviceBindWithDriverOverride(dev, stubDriverName) < 0)
1265         return -1;
1266 
1267     dev->unbind_from_stub = true;
1268     return 0;
1269 }
1270 
1271 /* virPCIDeviceDetach:
1272  *
1273  * Detach this device from the host driver, attach it to the stub
1274  * driver (previously set with virPCIDeviceSetStubDriver(), and add *a
1275  * copy* of the object to the inactiveDevs list (if provided). This
1276  * function will *never* consume dev, so the caller should free it.
1277  *
1278  * Returns 0 on success, -1 on failure (will fail if the device is
1279  * already in the activeDevs list, but will be a NOP if the device is
1280  * already bound to the stub).
1281  *
1282  * GENERAL NOTE: activeDevs should be a list of all PCI devices
1283  * currently in use by a domain. inactiveDevs is a list of all PCI
1284  * devices that libvirt has detached from the host driver + attached
1285  * to the stub driver, but hasn't yet assigned to a domain. Any device
1286  * that is still attached to its host driver should not be on either
1287  * list.
1288  */
1289 int
virPCIDeviceDetach(virPCIDevice * dev,virPCIDeviceList * activeDevs,virPCIDeviceList * inactiveDevs)1290 virPCIDeviceDetach(virPCIDevice *dev,
1291                    virPCIDeviceList *activeDevs,
1292                    virPCIDeviceList *inactiveDevs)
1293 {
1294     if (virPCIProbeStubDriver(dev->stubDriver) < 0)
1295         return -1;
1296 
1297     if (activeDevs && virPCIDeviceListFind(activeDevs, &dev->address)) {
1298         virReportError(VIR_ERR_INTERNAL_ERROR,
1299                        _("Not detaching active device %s"), dev->name);
1300         return -1;
1301     }
1302 
1303     if (virPCIDeviceBindToStub(dev) < 0)
1304         return -1;
1305 
1306     /* Add *a copy of* the dev into list inactiveDevs, if
1307      * it's not already there.
1308      */
1309     if (inactiveDevs && !virPCIDeviceListFind(inactiveDevs, &dev->address)) {
1310         VIR_DEBUG("Adding PCI device %s to inactive list", dev->name);
1311         if (virPCIDeviceListAddCopy(inactiveDevs, dev) < 0)
1312             return -1;
1313     }
1314 
1315     return 0;
1316 }
1317 
1318 /*
1319  * Pre-condition: inactivePCIHostdevs & activePCIHostdevs
1320  * are locked
1321  */
1322 int
virPCIDeviceReattach(virPCIDevice * dev,virPCIDeviceList * activeDevs,virPCIDeviceList * inactiveDevs)1323 virPCIDeviceReattach(virPCIDevice *dev,
1324                      virPCIDeviceList *activeDevs,
1325                      virPCIDeviceList *inactiveDevs)
1326 {
1327     if (activeDevs && virPCIDeviceListFind(activeDevs, &dev->address)) {
1328         virReportError(VIR_ERR_INTERNAL_ERROR,
1329                        _("Not reattaching active device %s"), dev->name);
1330         return -1;
1331     }
1332 
1333     if (virPCIDeviceUnbindFromStub(dev) < 0)
1334         return -1;
1335 
1336     /* Steal the dev from list inactiveDevs */
1337     if (inactiveDevs) {
1338         VIR_DEBUG("Removing PCI device %s from inactive list", dev->name);
1339         virPCIDeviceListDel(inactiveDevs, &dev->address);
1340     }
1341 
1342     return 0;
1343 }
1344 
1345 static char *
virPCIDeviceReadID(virPCIDevice * dev,const char * id_name)1346 virPCIDeviceReadID(virPCIDevice *dev, const char *id_name)
1347 {
1348     g_autofree char *path = NULL;
1349     g_autofree char *id_str = NULL;
1350 
1351     path = virPCIFile(dev->name, id_name);
1352 
1353     /* ID string is '0xNNNN\n' ... i.e. 7 bytes */
1354     if (virFileReadAll(path, 7, &id_str) < 0)
1355         return NULL;
1356 
1357     /* Check for 0x suffix */
1358     if (id_str[0] != '0' || id_str[1] != 'x')
1359         return NULL;
1360 
1361     /* Chop off the newline; we know the string is 7 bytes */
1362     id_str[6] = '\0';
1363 
1364     return g_steal_pointer(&id_str);
1365 }
1366 
1367 bool
virPCIDeviceAddressIsValid(virPCIDeviceAddress * addr,bool report)1368 virPCIDeviceAddressIsValid(virPCIDeviceAddress *addr,
1369                            bool report)
1370 {
1371     if (addr->bus > 0xFF) {
1372         if (report)
1373             virReportError(VIR_ERR_XML_ERROR,
1374                            _("Invalid PCI address bus='0x%x', "
1375                              "must be <= 0xFF"),
1376                            addr->bus);
1377         return false;
1378     }
1379     if (addr->slot > 0x1F) {
1380         if (report)
1381             virReportError(VIR_ERR_XML_ERROR,
1382                            _("Invalid PCI address slot='0x%x', "
1383                              "must be <= 0x1F"),
1384                            addr->slot);
1385         return false;
1386     }
1387     if (addr->function > 7) {
1388         if (report)
1389             virReportError(VIR_ERR_XML_ERROR,
1390                            _("Invalid PCI address function=0x%x, "
1391                              "must be <= 7"),
1392                            addr->function);
1393         return false;
1394     }
1395     if (virPCIDeviceAddressIsEmpty(addr)) {
1396         if (report)
1397             virReportError(VIR_ERR_XML_ERROR, "%s",
1398                            _("Invalid PCI address 0000:00:00, at least "
1399                              "one of domain, bus, or slot must be > 0"));
1400         return false;
1401     }
1402     return true;
1403 }
1404 
1405 bool
virPCIDeviceAddressIsEmpty(const virPCIDeviceAddress * addr)1406 virPCIDeviceAddressIsEmpty(const virPCIDeviceAddress *addr)
1407 {
1408     return !(addr->domain || addr->bus || addr->slot);
1409 }
1410 
1411 bool
virPCIDeviceAddressEqual(const virPCIDeviceAddress * addr1,const virPCIDeviceAddress * addr2)1412 virPCIDeviceAddressEqual(const virPCIDeviceAddress *addr1,
1413                          const virPCIDeviceAddress *addr2)
1414 {
1415     if (addr1->domain == addr2->domain &&
1416         addr1->bus == addr2->bus &&
1417         addr1->slot == addr2->slot &&
1418         addr1->function == addr2->function) {
1419         return true;
1420     }
1421     return false;
1422 }
1423 
1424 /**
1425  * virPCIDeviceAddressCopy:
1426  * @dst: where to store address
1427  * @src: source address to copy
1428  *
1429  * Creates a deep copy of given @src address and stores it into
1430  * @dst which has to be pre-allocated by caller.
1431  */
virPCIDeviceAddressCopy(virPCIDeviceAddress * dst,const virPCIDeviceAddress * src)1432 void virPCIDeviceAddressCopy(virPCIDeviceAddress *dst,
1433                              const virPCIDeviceAddress *src)
1434 {
1435     memcpy(dst, src, sizeof(*src));
1436 }
1437 
1438 char *
virPCIDeviceAddressAsString(const virPCIDeviceAddress * addr)1439 virPCIDeviceAddressAsString(const virPCIDeviceAddress *addr)
1440 {
1441     return g_strdup_printf(VIR_PCI_DEVICE_ADDRESS_FMT, addr->domain,
1442                            addr->bus, addr->slot, addr->function);
1443 }
1444 
1445 bool
virPCIDeviceExists(const virPCIDeviceAddress * addr)1446 virPCIDeviceExists(const virPCIDeviceAddress *addr)
1447 {
1448     g_autofree char *devName = virPCIDeviceAddressAsString(addr);
1449     g_autofree char *devPath = g_strdup_printf(PCI_SYSFS "devices/%s/config",
1450                                                devName);
1451 
1452     return virFileExists(devPath);
1453 }
1454 
1455 virPCIDevice *
virPCIDeviceNew(const virPCIDeviceAddress * address)1456 virPCIDeviceNew(const virPCIDeviceAddress *address)
1457 {
1458     g_autoptr(virPCIDevice) dev = NULL;
1459     g_autofree char *vendor = NULL;
1460     g_autofree char *product = NULL;
1461 
1462     dev = g_new0(virPCIDevice, 1);
1463 
1464     virPCIDeviceAddressCopy(&dev->address, address);
1465 
1466     dev->name = virPCIDeviceAddressAsString(&dev->address);
1467 
1468     dev->path = g_strdup_printf(PCI_SYSFS "devices/%s/config", dev->name);
1469 
1470     if (!virFileExists(dev->path)) {
1471         virReportSystemError(errno,
1472                              _("Device %s not found: could not access %s"),
1473                              dev->name, dev->path);
1474         return NULL;
1475     }
1476 
1477     vendor  = virPCIDeviceReadID(dev, "vendor");
1478     product = virPCIDeviceReadID(dev, "device");
1479 
1480     if (!vendor || !product) {
1481         virReportError(VIR_ERR_INTERNAL_ERROR,
1482                        _("Failed to read product/vendor ID for %s"),
1483                        dev->name);
1484         return NULL;
1485     }
1486 
1487     /* strings contain '0x' prefix */
1488     if (g_snprintf(dev->id, sizeof(dev->id), "%s %s", &vendor[2],
1489                    &product[2]) >= sizeof(dev->id)) {
1490         virReportError(VIR_ERR_INTERNAL_ERROR,
1491                        _("dev->id buffer overflow: %s %s"),
1492                        &vendor[2], &product[2]);
1493         return NULL;
1494     }
1495 
1496     VIR_DEBUG("%s %s: initialized", dev->id, dev->name);
1497 
1498     return g_steal_pointer(&dev);
1499 }
1500 
1501 
1502 virPCIDevice *
virPCIDeviceCopy(virPCIDevice * dev)1503 virPCIDeviceCopy(virPCIDevice *dev)
1504 {
1505     virPCIDevice *copy;
1506 
1507     copy = g_new0(virPCIDevice, 1);
1508 
1509     /* shallow copy to take care of most attributes */
1510     *copy = *dev;
1511     copy->path = NULL;
1512     copy->used_by_drvname = copy->used_by_domname = NULL;
1513     copy->name = g_strdup(dev->name);
1514     copy->path = g_strdup(dev->path);
1515     copy->used_by_drvname = g_strdup(dev->used_by_drvname);
1516     copy->used_by_domname = g_strdup(dev->used_by_domname);
1517     return copy;
1518 }
1519 
1520 
1521 void
virPCIDeviceFree(virPCIDevice * dev)1522 virPCIDeviceFree(virPCIDevice *dev)
1523 {
1524     if (!dev)
1525         return;
1526     VIR_DEBUG("%s %s: freeing", dev->id, dev->name);
1527     g_free(dev->name);
1528     g_free(dev->path);
1529     g_free(dev->used_by_drvname);
1530     g_free(dev->used_by_domname);
1531     g_free(dev);
1532 }
1533 
1534 /**
1535  * virPCIDeviceGetAddress:
1536  * @dev: device to get address from
1537  *
1538  * Take a PCI device on input and return its PCI address. The
1539  * returned object is owned by the device and must not be freed.
1540  *
1541  * Returns: a pointer to the address, which can never be NULL.
1542  */
1543 virPCIDeviceAddress *
virPCIDeviceGetAddress(virPCIDevice * dev)1544 virPCIDeviceGetAddress(virPCIDevice *dev)
1545 {
1546     return &(dev->address);
1547 }
1548 
1549 const char *
virPCIDeviceGetName(virPCIDevice * dev)1550 virPCIDeviceGetName(virPCIDevice *dev)
1551 {
1552     return dev->name;
1553 }
1554 
1555 /**
1556  * virPCIDeviceGetConfigPath:
1557  *
1558  * Returns a pointer to a string containing the path of @dev's PCI
1559  * config file.
1560  */
1561 const char *
virPCIDeviceGetConfigPath(virPCIDevice * dev)1562 virPCIDeviceGetConfigPath(virPCIDevice *dev)
1563 {
1564     return dev->path;
1565 }
1566 
/* Store @managed in @dev's managed flag. */
void
virPCIDeviceSetManaged(virPCIDevice *dev, bool managed)
{
    dev->managed = managed;
}
1571 
1572 bool
virPCIDeviceGetManaged(virPCIDevice * dev)1573 virPCIDeviceGetManaged(virPCIDevice *dev)
1574 {
1575     return dev->managed;
1576 }
1577 
1578 void
virPCIDeviceSetStubDriver(virPCIDevice * dev,virPCIStubDriver driver)1579 virPCIDeviceSetStubDriver(virPCIDevice *dev, virPCIStubDriver driver)
1580 {
1581     dev->stubDriver = driver;
1582 }
1583 
1584 virPCIStubDriver
virPCIDeviceGetStubDriver(virPCIDevice * dev)1585 virPCIDeviceGetStubDriver(virPCIDevice *dev)
1586 {
1587     return dev->stubDriver;
1588 }
1589 
1590 bool
virPCIDeviceGetUnbindFromStub(virPCIDevice * dev)1591 virPCIDeviceGetUnbindFromStub(virPCIDevice *dev)
1592 {
1593     return dev->unbind_from_stub;
1594 }
1595 
1596 void
virPCIDeviceSetUnbindFromStub(virPCIDevice * dev,bool unbind)1597 virPCIDeviceSetUnbindFromStub(virPCIDevice *dev, bool unbind)
1598 {
1599     dev->unbind_from_stub = unbind;
1600 }
1601 
1602 bool
virPCIDeviceGetRemoveSlot(virPCIDevice * dev)1603 virPCIDeviceGetRemoveSlot(virPCIDevice *dev)
1604 {
1605     return dev->remove_slot;
1606 }
1607 
1608 void
virPCIDeviceSetRemoveSlot(virPCIDevice * dev,bool remove_slot)1609 virPCIDeviceSetRemoveSlot(virPCIDevice *dev, bool remove_slot)
1610 {
1611     dev->remove_slot = remove_slot;
1612 }
1613 
1614 bool
virPCIDeviceGetReprobe(virPCIDevice * dev)1615 virPCIDeviceGetReprobe(virPCIDevice *dev)
1616 {
1617     return dev->reprobe;
1618 }
1619 
1620 void
virPCIDeviceSetReprobe(virPCIDevice * dev,bool reprobe)1621 virPCIDeviceSetReprobe(virPCIDevice *dev, bool reprobe)
1622 {
1623     dev->reprobe = reprobe;
1624 }
1625 
1626 int
virPCIDeviceSetUsedBy(virPCIDevice * dev,const char * drv_name,const char * dom_name)1627 virPCIDeviceSetUsedBy(virPCIDevice *dev,
1628                       const char *drv_name,
1629                       const char *dom_name)
1630 {
1631     VIR_FREE(dev->used_by_drvname);
1632     VIR_FREE(dev->used_by_domname);
1633     dev->used_by_drvname = g_strdup(drv_name);
1634     dev->used_by_domname = g_strdup(dom_name);
1635 
1636     return 0;
1637 }
1638 
1639 void
virPCIDeviceGetUsedBy(virPCIDevice * dev,const char ** drv_name,const char ** dom_name)1640 virPCIDeviceGetUsedBy(virPCIDevice *dev,
1641                       const char **drv_name,
1642                       const char **dom_name)
1643 {
1644     *drv_name = dev->used_by_drvname;
1645     *dom_name = dev->used_by_domname;
1646 }
1647 
1648 virPCIDeviceList *
virPCIDeviceListNew(void)1649 virPCIDeviceListNew(void)
1650 {
1651     virPCIDeviceList *list;
1652 
1653     if (virPCIInitialize() < 0)
1654         return NULL;
1655 
1656     if (!(list = virObjectLockableNew(virPCIDeviceListClass)))
1657         return NULL;
1658 
1659     return list;
1660 }
1661 
1662 static void
virPCIDeviceListDispose(void * obj)1663 virPCIDeviceListDispose(void *obj)
1664 {
1665     virPCIDeviceList *list = obj;
1666     size_t i;
1667 
1668     for (i = 0; i < list->count; i++) {
1669         virPCIDeviceFree(list->devs[i]);
1670         list->devs[i] = NULL;
1671     }
1672 
1673     list->count = 0;
1674     g_free(list->devs);
1675 }
1676 
1677 int
virPCIDeviceListAdd(virPCIDeviceList * list,virPCIDevice * dev)1678 virPCIDeviceListAdd(virPCIDeviceList *list,
1679                     virPCIDevice *dev)
1680 {
1681     if (virPCIDeviceListFind(list, &dev->address)) {
1682         virReportError(VIR_ERR_INTERNAL_ERROR,
1683                        _("Device %s is already in use"), dev->name);
1684         return -1;
1685     }
1686     VIR_APPEND_ELEMENT(list->devs, list->count, dev);
1687 
1688     return 0;
1689 }
1690 
1691 
1692 /* virPCIDeviceListAddCopy - add a *copy* of the device to this list */
1693 int
virPCIDeviceListAddCopy(virPCIDeviceList * list,virPCIDevice * dev)1694 virPCIDeviceListAddCopy(virPCIDeviceList *list, virPCIDevice *dev)
1695 {
1696     g_autoptr(virPCIDevice) copy = virPCIDeviceCopy(dev);
1697 
1698     if (!copy)
1699         return -1;
1700     if (virPCIDeviceListAdd(list, copy) < 0)
1701         return -1;
1702 
1703     copy = NULL;
1704     return 0;
1705 }
1706 
1707 
1708 virPCIDevice *
virPCIDeviceListGet(virPCIDeviceList * list,int idx)1709 virPCIDeviceListGet(virPCIDeviceList *list,
1710                     int idx)
1711 {
1712     if (idx >= list->count)
1713         return NULL;
1714     if (idx < 0)
1715         return NULL;
1716 
1717     return list->devs[idx];
1718 }
1719 
1720 size_t
virPCIDeviceListCount(virPCIDeviceList * list)1721 virPCIDeviceListCount(virPCIDeviceList *list)
1722 {
1723     return list->count;
1724 }
1725 
1726 virPCIDevice *
virPCIDeviceListStealIndex(virPCIDeviceList * list,int idx)1727 virPCIDeviceListStealIndex(virPCIDeviceList *list,
1728                            int idx)
1729 {
1730     virPCIDevice *ret;
1731 
1732     if (idx < 0 || idx >= list->count)
1733         return NULL;
1734 
1735     ret = list->devs[idx];
1736     VIR_DELETE_ELEMENT(list->devs, idx, list->count);
1737     return ret;
1738 }
1739 
1740 virPCIDevice *
virPCIDeviceListSteal(virPCIDeviceList * list,virPCIDeviceAddress * devAddr)1741 virPCIDeviceListSteal(virPCIDeviceList *list,
1742                       virPCIDeviceAddress *devAddr)
1743 {
1744     return virPCIDeviceListStealIndex(list, virPCIDeviceListFindIndex(list, devAddr));
1745 }
1746 
1747 void
virPCIDeviceListDel(virPCIDeviceList * list,virPCIDeviceAddress * devAddr)1748 virPCIDeviceListDel(virPCIDeviceList *list,
1749                     virPCIDeviceAddress *devAddr)
1750 {
1751     virPCIDeviceFree(virPCIDeviceListSteal(list, devAddr));
1752 }
1753 
1754 int
virPCIDeviceListFindIndex(virPCIDeviceList * list,virPCIDeviceAddress * devAddr)1755 virPCIDeviceListFindIndex(virPCIDeviceList *list,
1756                           virPCIDeviceAddress *devAddr)
1757 {
1758     size_t i;
1759 
1760     for (i = 0; i < list->count; i++) {
1761         virPCIDevice *other = list->devs[i];
1762         if (other->address.domain   == devAddr->domain &&
1763             other->address.bus      == devAddr->bus    &&
1764             other->address.slot     == devAddr->slot   &&
1765             other->address.function == devAddr->function)
1766             return i;
1767     }
1768     return -1;
1769 }
1770 
1771 
1772 virPCIDevice *
virPCIDeviceListFindByIDs(virPCIDeviceList * list,unsigned int domain,unsigned int bus,unsigned int slot,unsigned int function)1773 virPCIDeviceListFindByIDs(virPCIDeviceList *list,
1774                           unsigned int domain,
1775                           unsigned int bus,
1776                           unsigned int slot,
1777                           unsigned int function)
1778 {
1779     size_t i;
1780 
1781     for (i = 0; i < list->count; i++) {
1782         virPCIDevice *other = list->devs[i];
1783         if (other->address.domain   == domain &&
1784             other->address.bus      == bus    &&
1785             other->address.slot     == slot   &&
1786             other->address.function == function)
1787             return list->devs[i];
1788     }
1789     return NULL;
1790 }
1791 
1792 
1793 virPCIDevice *
virPCIDeviceListFind(virPCIDeviceList * list,virPCIDeviceAddress * devAddr)1794 virPCIDeviceListFind(virPCIDeviceList *list, virPCIDeviceAddress *devAddr)
1795 {
1796     int idx;
1797 
1798     if ((idx = virPCIDeviceListFindIndex(list, devAddr)) >= 0)
1799         return list->devs[idx];
1800     else
1801         return NULL;
1802 }
1803 
1804 
/**
 * virPCIDeviceFileIterate:
 * @dev: device whose sysfs directory is walked
 * @actor: callback run for every selected file
 * @opaque: passed through to @actor unchanged
 *
 * Walks /sys/bus/pci/devices/<address of @dev> and calls @actor with
 * the full path of each entry named "config", "rom", "vendor",
 * "device" or "reset", or whose name starts with "resource".
 *
 * Returns 0 on success; -1 if the directory cannot be opened, reading
 * it fails, or @actor returns a negative value (iteration stops at the
 * first such return).
 */
int
virPCIDeviceFileIterate(virPCIDevice *dev,
                        virPCIDeviceFileActor actor,
                        void *opaque)
{
    g_autofree char *devDir = NULL;
    g_autoptr(DIR) dirp = NULL;
    struct dirent *entry;
    int rc;

    devDir = g_strdup_printf("/sys/bus/pci/devices/" VIR_PCI_DEVICE_ADDRESS_FMT,
                             dev->address.domain, dev->address.bus, dev->address.slot,
                             dev->address.function);

    if (virDirOpen(&dirp, devDir) < 0)
        return -1;

    while ((rc = virDirRead(dirp, &entry, devDir)) > 0) {
        g_autofree char *path = NULL;
        const char *name = entry->d_name;

        /* Device assignment requires:
         *   $PCIDIR/config, $PCIDIR/resource, $PCIDIR/resourceNNN,
         *   $PCIDIR/rom, $PCIDIR/reset, $PCIDIR/vendor, $PCIDIR/device
         * Everything else in the directory is skipped.
         */
        if (!(STREQ(name, "config") ||
              STRPREFIX(name, "resource") ||
              STREQ(name, "rom") ||
              STREQ(name, "vendor") ||
              STREQ(name, "device") ||
              STREQ(name, "reset")))
            continue;

        path = g_strdup_printf("%s/%s", devDir, name);
        if ((actor)(dev, path, opaque) < 0)
            return -1;
    }

    return rc < 0 ? -1 : 0;
}
1843 
1844 
1845 /* virPCIDeviceAddressIOMMUGroupIterate:
1846  *   Call @actor for all devices in the same iommu_group as orig
1847  *   (including orig itself) Even if there is no iommu_group for the
1848  *   device, call @actor once for orig.
1849  */
1850 int
virPCIDeviceAddressIOMMUGroupIterate(virPCIDeviceAddress * orig,virPCIDeviceAddressActor actor,void * opaque)1851 virPCIDeviceAddressIOMMUGroupIterate(virPCIDeviceAddress *orig,
1852                                      virPCIDeviceAddressActor actor,
1853                                      void *opaque)
1854 {
1855     g_autofree char *groupPath = NULL;
1856     g_autoptr(DIR) groupDir = NULL;
1857     struct dirent *ent;
1858     int direrr;
1859 
1860     groupPath = g_strdup_printf(PCI_SYSFS "devices/" VIR_PCI_DEVICE_ADDRESS_FMT "/iommu_group/devices",
1861                                 orig->domain, orig->bus, orig->slot, orig->function);
1862 
1863     if (virDirOpenQuiet(&groupDir, groupPath) < 0) {
1864         /* just process the original device, nothing more */
1865         return (actor)(orig, opaque);
1866     }
1867 
1868     while ((direrr = virDirRead(groupDir, &ent, groupPath)) > 0) {
1869         virPCIDeviceAddress newDev;
1870 
1871         if (virPCIDeviceAddressParse(ent->d_name, &newDev) < 0) {
1872             virReportError(VIR_ERR_INTERNAL_ERROR,
1873                            _("Found invalid device link '%s' in '%s'"),
1874                            ent->d_name, groupPath);
1875             return -1;
1876         }
1877 
1878         if ((actor)(&newDev, opaque) < 0)
1879             return -1;
1880     }
1881     if (direrr < 0)
1882         return -1;
1883 
1884     return 0;
1885 }
1886 
1887 
1888 static int
virPCIDeviceGetIOMMUGroupAddOne(virPCIDeviceAddress * newDevAddr,void * opaque)1889 virPCIDeviceGetIOMMUGroupAddOne(virPCIDeviceAddress *newDevAddr, void *opaque)
1890 {
1891     virPCIDeviceList *groupList = opaque;
1892     g_autoptr(virPCIDevice) newDev = NULL;
1893 
1894     if (!(newDev = virPCIDeviceNew(newDevAddr)))
1895         return -1;
1896 
1897     if (virPCIDeviceListAdd(groupList, newDev) < 0)
1898         return -1;
1899 
1900     newDev = NULL; /* it's now on the list */
1901     return 0;
1902 }
1903 
1904 
1905 /*
1906  * virPCIDeviceGetIOMMUGroupList - return a virPCIDeviceList containing
1907  * all of the devices in the same iommu_group as @dev.
1908  *
1909  * Return the new list, or NULL on failure
1910  */
1911 virPCIDeviceList *
virPCIDeviceGetIOMMUGroupList(virPCIDevice * dev)1912 virPCIDeviceGetIOMMUGroupList(virPCIDevice *dev)
1913 {
1914     virPCIDeviceList *groupList = virPCIDeviceListNew();
1915 
1916     if (!groupList)
1917         goto error;
1918 
1919     if (virPCIDeviceAddressIOMMUGroupIterate(&(dev->address),
1920                                              virPCIDeviceGetIOMMUGroupAddOne,
1921                                              groupList) < 0)
1922         goto error;
1923 
1924     return groupList;
1925 
1926  error:
1927     virObjectUnref(groupList);
1928     return NULL;
1929 }
1930 
1931 
/* Pair of out-parameters bundled so they can travel through the single
 * opaque pointer of an IOMMU group iteration callback; consumed by
 * virPCIGetIOMMUGroupAddressesAddOne().
 */
typedef struct {
    /* points at the array of address pointers being appended to */
    virPCIDeviceAddress ***iommuGroupDevices;
    /* points at the element count of that array */
    size_t *nIommuGroupDevices;
} virPCIDeviceAddressList;
1936 
1937 static int
virPCIGetIOMMUGroupAddressesAddOne(virPCIDeviceAddress * newDevAddr,void * opaque)1938 virPCIGetIOMMUGroupAddressesAddOne(virPCIDeviceAddress *newDevAddr, void *opaque)
1939 {
1940     virPCIDeviceAddressList *addrList = opaque;
1941     g_autofree virPCIDeviceAddress *copyAddr = NULL;
1942 
1943     /* make a copy to insert onto the list */
1944     copyAddr = g_new0(virPCIDeviceAddress, 1);
1945 
1946     *copyAddr = *newDevAddr;
1947 
1948     VIR_APPEND_ELEMENT(*addrList->iommuGroupDevices,
1949                        *addrList->nIommuGroupDevices, copyAddr);
1950 
1951     return 0;
1952 }
1953 
1954 
1955 /*
1956  * virPCIDeviceAddressGetIOMMUGroupAddresses - return a
1957  * virPCIDeviceList containing all of the devices in the same
1958  * iommu_group as @dev.
1959  *
1960  * Return the new list, or NULL on failure
1961  */
1962 int
virPCIDeviceAddressGetIOMMUGroupAddresses(virPCIDeviceAddress * devAddr,virPCIDeviceAddress *** iommuGroupDevices,size_t * nIommuGroupDevices)1963 virPCIDeviceAddressGetIOMMUGroupAddresses(virPCIDeviceAddress *devAddr,
1964                                           virPCIDeviceAddress ***iommuGroupDevices,
1965                                           size_t *nIommuGroupDevices)
1966 {
1967     virPCIDeviceAddressList addrList = { iommuGroupDevices,
1968                                          nIommuGroupDevices };
1969 
1970     if (virPCIDeviceAddressIOMMUGroupIterate(devAddr,
1971                                              virPCIGetIOMMUGroupAddressesAddOne,
1972                                              &addrList) < 0)
1973         return -1;
1974 
1975     return 0;
1976 }
1977 
1978 
1979 /* virPCIDeviceAddressGetIOMMUGroupNum - return the group number of
1980  * this PCI device's iommu_group, or -2 if there is no iommu_group for
1981  * the device (or -1 if there was any other error)
1982  */
1983 int
virPCIDeviceAddressGetIOMMUGroupNum(virPCIDeviceAddress * addr)1984 virPCIDeviceAddressGetIOMMUGroupNum(virPCIDeviceAddress *addr)
1985 {
1986     g_autofree char *devName = NULL;
1987     g_autofree char *devPath = NULL;
1988     g_autofree char *groupPath = NULL;
1989     g_autofree char *groupNumStr = NULL;
1990     unsigned int groupNum;
1991 
1992     devName = virPCIDeviceAddressAsString(addr);
1993 
1994     devPath = virPCIFile(devName, "iommu_group");
1995 
1996     if (virFileIsLink(devPath) != 1)
1997         return -2;
1998     if (virFileResolveLink(devPath, &groupPath) < 0) {
1999         virReportError(VIR_ERR_INTERNAL_ERROR,
2000                        _("Unable to resolve device %s iommu_group symlink %s"),
2001                        devName, devPath);
2002         return -1;
2003     }
2004 
2005     groupNumStr = g_path_get_basename(groupPath);
2006     if (virStrToLong_ui(groupNumStr, NULL, 10, &groupNum) < 0) {
2007         virReportError(VIR_ERR_INTERNAL_ERROR,
2008                        _("device %s iommu_group symlink %s has "
2009                          "invalid group number %s"),
2010                        devName, groupPath, groupNumStr);
2011         return -1;
2012     }
2013 
2014     return groupNum;
2015 }
2016 
2017 
2018 char *
virPCIDeviceAddressGetIOMMUGroupDev(const virPCIDeviceAddress * devAddr)2019 virPCIDeviceAddressGetIOMMUGroupDev(const virPCIDeviceAddress *devAddr)
2020 {
2021     g_autoptr(virPCIDevice) pci = NULL;
2022 
2023     if (!(pci = virPCIDeviceNew(devAddr)))
2024         return NULL;
2025 
2026     return virPCIDeviceGetIOMMUGroupDev(pci);
2027 }
2028 
2029 
2030 /* virPCIDeviceGetIOMMUGroupDev - return the name of the device used
2031  * to control this PCI device's group (e.g. "/dev/vfio/15")
2032  */
2033 char *
virPCIDeviceGetIOMMUGroupDev(virPCIDevice * dev)2034 virPCIDeviceGetIOMMUGroupDev(virPCIDevice *dev)
2035 {
2036     g_autofree char *devPath = NULL;
2037     g_autofree char *groupPath = NULL;
2038     g_autofree char *groupFile = NULL;
2039 
2040     devPath = virPCIFile(dev->name, "iommu_group");
2041 
2042     if (virFileIsLink(devPath) != 1) {
2043         virReportError(VIR_ERR_INTERNAL_ERROR,
2044                        _("Invalid device %s iommu_group file %s is not a symlink"),
2045                        dev->name, devPath);
2046         return NULL;
2047     }
2048     if (virFileResolveLink(devPath, &groupPath) < 0) {
2049         virReportError(VIR_ERR_INTERNAL_ERROR,
2050                        _("Unable to resolve device %s iommu_group symlink %s"),
2051                        dev->name, devPath);
2052         return NULL;
2053     }
2054     groupFile = g_path_get_basename(groupPath);
2055 
2056     return g_strdup_printf("/dev/vfio/%s", groupFile);
2057 }
2058 
2059 static int
virPCIDeviceDownstreamLacksACS(virPCIDevice * dev)2060 virPCIDeviceDownstreamLacksACS(virPCIDevice *dev)
2061 {
2062     uint16_t flags;
2063     uint16_t ctrl;
2064     unsigned int pos;
2065     int fd;
2066     int ret = 0;
2067     uint16_t device_class;
2068 
2069     if ((fd = virPCIDeviceConfigOpen(dev)) < 0)
2070         return -1;
2071 
2072     if (virPCIDeviceInit(dev, fd) < 0) {
2073         ret = -1;
2074         goto cleanup;
2075     }
2076 
2077     if (virPCIDeviceReadClass(dev, &device_class) < 0)
2078         goto cleanup;
2079 
2080     pos = dev->pcie_cap_pos;
2081     if (!pos || device_class != PCI_CLASS_BRIDGE_PCI)
2082         goto cleanup;
2083 
2084     flags = virPCIDeviceRead16(dev, fd, pos + PCI_EXP_FLAGS);
2085     if (((flags & PCI_EXP_FLAGS_TYPE) >> 4) != PCI_EXP_TYPE_DOWNSTREAM)
2086         goto cleanup;
2087 
2088     pos = virPCIDeviceFindExtendedCapabilityOffset(dev, fd, PCI_EXT_CAP_ID_ACS);
2089     if (!pos) {
2090         VIR_DEBUG("%s %s: downstream port lacks ACS", dev->id, dev->name);
2091         ret = 1;
2092         goto cleanup;
2093     }
2094 
2095     ctrl = virPCIDeviceRead16(dev, fd, pos + PCI_EXT_ACS_CTRL);
2096     if ((ctrl & PCI_EXT_CAP_ACS_ENABLED) != PCI_EXT_CAP_ACS_ENABLED) {
2097         VIR_DEBUG("%s %s: downstream port has ACS disabled",
2098                   dev->id, dev->name);
2099         ret = 1;
2100         goto cleanup;
2101     }
2102 
2103  cleanup:
2104     virPCIDeviceConfigClose(dev, fd);
2105     return ret;
2106 }
2107 
2108 static int
virPCIDeviceIsBehindSwitchLackingACS(virPCIDevice * dev)2109 virPCIDeviceIsBehindSwitchLackingACS(virPCIDevice *dev)
2110 {
2111     g_autoptr(virPCIDevice) parent = NULL;
2112 
2113     if (virPCIDeviceGetParent(dev, &parent) < 0)
2114         return -1;
2115     if (!parent) {
2116         /* if we have no parent, and this is the root bus, ACS doesn't come
2117          * into play since devices on the root bus can't P2P without going
2118          * through the root IOMMU.
2119          */
2120         if (dev->address.bus == 0) {
2121             return 0;
2122         } else {
2123             virReportError(VIR_ERR_INTERNAL_ERROR,
2124                            _("Failed to find parent device for %s"),
2125                            dev->name);
2126             return -1;
2127         }
2128     }
2129 
2130     /* XXX we should rather fail when we can't find device's parent and
2131      * stop the loop when we get to root instead of just stopping when no
2132      * parent can be found
2133      */
2134     do {
2135         g_autoptr(virPCIDevice) tmp = NULL;
2136         int acs;
2137         int ret;
2138 
2139         acs = virPCIDeviceDownstreamLacksACS(parent);
2140 
2141         if (acs) {
2142             if (acs < 0)
2143                 return -1;
2144             else
2145                 return 1;
2146         }
2147 
2148         tmp = g_steal_pointer(&parent);
2149         ret = virPCIDeviceGetParent(tmp, &parent);
2150         if (ret < 0)
2151             return -1;
2152     } while (parent);
2153 
2154     return 0;
2155 }
2156 
/**
 * virPCIDeviceIsAssignable:
 * @dev: device to check
 * @strict_acs_check: non-zero to refuse devices behind a switch that
 *                    lacks ACS
 *
 * Returns 1 if @dev may be assigned, 0 if not. A failure of the ACS
 * check itself also yields 0.
 */
int
virPCIDeviceIsAssignable(virPCIDevice *dev,
                         int strict_acs_check)
{
    int lacksACS;

    /* XXX This could be a great place to actually check that a non-managed
     * device isn't in use, e.g. by checking that device is either un-bound
     * or bound to a stub driver.
     */

    lacksACS = virPCIDeviceIsBehindSwitchLackingACS(dev);
    if (lacksACS < 0)
        return 0;

    if (lacksACS == 0)
        return 1;

    if (strict_acs_check) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Device %s is behind a switch lacking ACS and "
                         "cannot be assigned"),
                       dev->name);
        return 0;
    }

    VIR_DEBUG("%s %s: strict ACS check disabled; device assignment allowed",
              dev->id, dev->name);
    return 1;
}
2186 
/* Wrapper around virStrToLong_ui() that logs an error mentioning @s
 * when the conversion returns non-zero. The arguments and the return
 * value are passed through unchanged.
 */
static int
logStrToLong_ui(char const *s,
                char **end_ptr,
                int base,
                unsigned int *result)
{
    int rc = virStrToLong_ui(s, end_ptr, base, result);

    if (rc != 0)
        VIR_ERROR(_("Failed to convert '%s' to unsigned int"), s);

    return rc;
}
2200 
2201 int
virPCIDeviceAddressParse(char * address,virPCIDeviceAddress * bdf)2202 virPCIDeviceAddressParse(char *address,
2203                          virPCIDeviceAddress *bdf)
2204 {
2205     char *p = NULL;
2206 
2207     if ((address == NULL) || (logStrToLong_ui(address, &p, 16,
2208                                               &bdf->domain) == -1)) {
2209         return -1;
2210     }
2211 
2212     if ((p == NULL) || (logStrToLong_ui(p+1, &p, 16,
2213                                         &bdf->bus) == -1)) {
2214         return -1;
2215     }
2216 
2217     if ((p == NULL) || (logStrToLong_ui(p+1, &p, 16,
2218                                         &bdf->slot) == -1)) {
2219         return -1;
2220     }
2221 
2222     if ((p == NULL) || (logStrToLong_ui(p+1, &p, 16,
2223                                         &bdf->function) == -1)) {
2224         return -1;
2225     }
2226 
2227     return 0;
2228 }
2229 
2230 
2231 bool
virZPCIDeviceAddressIsIncomplete(const virZPCIDeviceAddress * addr)2232 virZPCIDeviceAddressIsIncomplete(const virZPCIDeviceAddress *addr)
2233 {
2234     return !addr->uid.isSet || !addr->fid.isSet;
2235 }
2236 
2237 
2238 bool
virZPCIDeviceAddressIsPresent(const virZPCIDeviceAddress * addr)2239 virZPCIDeviceAddressIsPresent(const virZPCIDeviceAddress *addr)
2240 {
2241     return addr->uid.isSet || addr->fid.isSet;
2242 }
2243 
2244 
2245 void
virPCIVirtualFunctionListFree(virPCIVirtualFunctionList * list)2246 virPCIVirtualFunctionListFree(virPCIVirtualFunctionList *list)
2247 {
2248     size_t i;
2249 
2250     if (!list)
2251         return;
2252 
2253     for (i = 0; i < list->nfunctions; i++) {
2254         g_free(list->functions[i].addr);
2255         g_free(list->functions[i].ifname);
2256     }
2257 
2258     g_free(list);
2259 }
2260 
2261 
2262 int
virPCIGetVirtualFunctions(const char * sysfs_path,virPCIVirtualFunctionList ** vfs)2263 virPCIGetVirtualFunctions(const char *sysfs_path,
2264                           virPCIVirtualFunctionList **vfs)
2265 {
2266     return virPCIGetVirtualFunctionsFull(sysfs_path, vfs, NULL);
2267 }
2268 
2269 
2270 #ifdef __linux__
2271 
2272 virPCIDeviceAddress *
virPCIGetDeviceAddressFromSysfsLink(const char * device_link)2273 virPCIGetDeviceAddressFromSysfsLink(const char *device_link)
2274 {
2275     g_autofree virPCIDeviceAddress *bdf = NULL;
2276     g_autofree char *config_address = NULL;
2277     g_autofree char *device_path = NULL;
2278 
2279     if (!virFileExists(device_link)) {
2280         VIR_DEBUG("'%s' does not exist", device_link);
2281         return NULL;
2282     }
2283 
2284     device_path = virFileCanonicalizePath(device_link);
2285     if (device_path == NULL) {
2286         virReportSystemError(errno,
2287                              _("Failed to resolve device link '%s'"),
2288                              device_link);
2289         return NULL;
2290     }
2291 
2292     config_address = g_path_get_basename(device_path);
2293     bdf = g_new0(virPCIDeviceAddress, 1);
2294 
2295     if (virPCIDeviceAddressParse(config_address, bdf) < 0) {
2296         virReportError(VIR_ERR_INTERNAL_ERROR,
2297                        _("Failed to parse PCI config address '%s'"),
2298                        config_address);
2299         return NULL;
2300     }
2301 
2302     return g_steal_pointer(&bdf);
2303 }
2304 
2305 /**
2306  * virPCIGetPhysicalFunction:
2307  * @vf_sysfs_path: sysfs path for the virtual function
2308  * @pf: where to store the physical function's address
2309  *
2310  * Given @vf_sysfs_path, this function will store the pointer
2311  * to a newly-allocated virPCIDeviceAddress in @pf.
2312  *
2313  * @pf might be NULL if @vf_sysfs_path does not point to a
2314  * virtual function. If it's not NULL, then it should be
2315  * freed by the caller when no longer needed.
2316  *
2317  * Returns: >=0 on success, <0 on failure
2318  */
2319 int
virPCIGetPhysicalFunction(const char * vf_sysfs_path,virPCIDeviceAddress ** pf)2320 virPCIGetPhysicalFunction(const char *vf_sysfs_path,
2321                           virPCIDeviceAddress **pf)
2322 {
2323     g_autofree char *device_link = NULL;
2324 
2325     *pf = NULL;
2326 
2327     virBuildPath(&device_link, vf_sysfs_path, "physfn");
2328 
2329     if ((*pf = virPCIGetDeviceAddressFromSysfsLink(device_link))) {
2330         VIR_DEBUG("PF for VF device '%s': " VIR_PCI_DEVICE_ADDRESS_FMT,
2331                   vf_sysfs_path,
2332                   (*pf)->domain, (*pf)->bus, (*pf)->slot, (*pf)->function);
2333     }
2334 
2335     return 0;
2336 }
2337 
2338 
2339 /**
2340  * virPCIGetVirtualFunctionsFull:
2341  * @sysfs_path: path to physical function sysfs entry
2342  * @vfs: filled with the virtual function data
2343  * @pfPhysPortID: Optional physical port id. If provided the network interface
2344  *                name of the VFs is queried too.
2345  *
2346  *
2347  * Returns virtual functions of a physical function.
2348  */
2349 int
virPCIGetVirtualFunctionsFull(const char * sysfs_path,virPCIVirtualFunctionList ** vfs,const char * pfPhysPortID)2350 virPCIGetVirtualFunctionsFull(const char *sysfs_path,
2351                               virPCIVirtualFunctionList **vfs,
2352                               const char *pfPhysPortID)
2353 {
2354     g_autofree char *totalvfs_file = NULL;
2355     g_autofree char *totalvfs_str = NULL;
2356     g_autoptr(virPCIVirtualFunctionList) list = g_new0(virPCIVirtualFunctionList, 1);
2357 
2358     *vfs = NULL;
2359 
2360     totalvfs_file = g_strdup_printf("%s/sriov_totalvfs", sysfs_path);
2361     if (virFileExists(totalvfs_file)) {
2362         char *end = NULL; /* so that terminating \n doesn't create error */
2363         unsigned long long maxfunctions = 0;
2364 
2365         if (virFileReadAll(totalvfs_file, 16, &totalvfs_str) < 0)
2366             return -1;
2367         if (virStrToLong_ull(totalvfs_str, &end, 10, &maxfunctions) < 0) {
2368             virReportError(VIR_ERR_INTERNAL_ERROR,
2369                            _("Unrecognized value in %s: %s"),
2370                            totalvfs_file, totalvfs_str);
2371             return -1;
2372         }
2373         list->maxfunctions = maxfunctions;
2374     }
2375 
2376     do {
2377         g_autofree char *device_link = NULL;
2378         struct virPCIVirtualFunction fnc = { NULL, NULL };
2379 
2380         /* look for virtfn%d links until one isn't found */
2381         device_link = g_strdup_printf("%s/virtfn%zu", sysfs_path, list->nfunctions);
2382 
2383         if (!virFileExists(device_link))
2384             break;
2385 
2386         if (!(fnc.addr = virPCIGetDeviceAddressFromSysfsLink(device_link))) {
2387             virReportError(VIR_ERR_INTERNAL_ERROR,
2388                            _("Failed to get SRIOV function from device link '%s'"),
2389                            device_link);
2390             return -1;
2391         }
2392 
2393         if (pfPhysPortID) {
2394             if (virPCIGetNetName(device_link, 0, pfPhysPortID, &fnc.ifname) < 0) {
2395                 g_free(fnc.addr);
2396                 return -1;
2397             }
2398         }
2399 
2400         VIR_APPEND_ELEMENT(list->functions, list->nfunctions, fnc);
2401     } while (1);
2402 
2403     VIR_DEBUG("Found %zu virtual functions for %s", list->nfunctions, sysfs_path);
2404 
2405     *vfs = g_steal_pointer(&list);
2406     return 0;
2407 }
2408 
2409 
2410 /*
2411  * Returns 1 if vf device is a virtual function, 0 if not, -1 on error
2412  */
2413 int
virPCIIsVirtualFunction(const char * vf_sysfs_device_link)2414 virPCIIsVirtualFunction(const char *vf_sysfs_device_link)
2415 {
2416     g_autofree char *vf_sysfs_physfn_link = NULL;
2417 
2418     vf_sysfs_physfn_link = g_strdup_printf("%s/physfn", vf_sysfs_device_link);
2419 
2420     return virFileExists(vf_sysfs_physfn_link);
2421 }
2422 
2423 /*
2424  * Returns the sriov virtual function index of vf given its pf
2425  */
2426 int
virPCIGetVirtualFunctionIndex(const char * pf_sysfs_device_link,const char * vf_sysfs_device_link,int * vf_index)2427 virPCIGetVirtualFunctionIndex(const char *pf_sysfs_device_link,
2428                               const char *vf_sysfs_device_link,
2429                               int *vf_index)
2430 {
2431     size_t i;
2432     g_autofree virPCIDeviceAddress *vf_bdf = NULL;
2433     g_autoptr(virPCIVirtualFunctionList) virt_fns = NULL;
2434 
2435     if (!(vf_bdf = virPCIGetDeviceAddressFromSysfsLink(vf_sysfs_device_link)))
2436         return -1;
2437 
2438     if (virPCIGetVirtualFunctions(pf_sysfs_device_link, &virt_fns) < 0) {
2439         virReportError(VIR_ERR_INTERNAL_ERROR,
2440                        _("Error getting physical function's '%s' "
2441                          "virtual_functions"), pf_sysfs_device_link);
2442         return -1;
2443     }
2444 
2445     for (i = 0; i < virt_fns->nfunctions; i++) {
2446         if (virPCIDeviceAddressEqual(vf_bdf, virt_fns->functions[i].addr)) {
2447             *vf_index = i;
2448             return 0;
2449         }
2450     }
2451 
2452     return -1;
2453 }
2454 
2455 /*
2456  * Returns a path to the PCI sysfs file given the BDF of the PCI function
2457  */
2458 
2459 int
virPCIDeviceAddressGetSysfsFile(virPCIDeviceAddress * addr,char ** pci_sysfs_device_link)2460 virPCIDeviceAddressGetSysfsFile(virPCIDeviceAddress *addr,
2461                                 char **pci_sysfs_device_link)
2462 {
2463     *pci_sysfs_device_link = g_strdup_printf(PCI_SYSFS "devices/" VIR_PCI_DEVICE_ADDRESS_FMT, addr->domain,
2464                                              addr->bus, addr->slot, addr->function);
2465     return 0;
2466 }
2467 
2468 /**
2469  * virPCIGetNetName:
2470  * @device_link_sysfs_path: sysfs path to the PCI device
2471  * @idx: used to choose which netdev when there are several
2472  *       (ignored if physPortID is set or physPortName is available)
2473  * @physPortID: match this string in the netdev's phys_port_id
2474  *       (or NULL to ignore and use phys_port_name or idx instead)
2475  * @netname: used to return the name of the netdev
2476  *       (set to NULL (but returns success) if there is no netdev)
2477  *
2478  * Returns 0 on success, -1 on error (error has been logged)
2479  */
2480 int
virPCIGetNetName(const char * device_link_sysfs_path,size_t idx,const char * physPortID,char ** netname)2481 virPCIGetNetName(const char *device_link_sysfs_path,
2482                  size_t idx,
2483                  const char *physPortID,
2484                  char **netname)
2485 {
2486     g_autofree char *pcidev_sysfs_net_path = NULL;
2487     g_autofree char *firstEntryName = NULL;
2488     g_autoptr(DIR) dir = NULL;
2489     struct dirent *entry = NULL;
2490     size_t i = 0;
2491 
2492     *netname = NULL;
2493 
2494     virBuildPath(&pcidev_sysfs_net_path, device_link_sysfs_path, "net");
2495 
2496     if (virDirOpenQuiet(&dir, pcidev_sysfs_net_path) < 0) {
2497         /* this *isn't* an error - caller needs to check for netname == NULL */
2498         return 0;
2499     }
2500 
2501     while (virDirRead(dir, &entry, pcidev_sysfs_net_path) > 0) {
2502         /* save the first entry we find to use as a failsafe
2503          * in case we don't match the phys_port_id. This is
2504          * needed because some NIC drivers (e.g. i40e)
2505          * implement phys_port_id for PFs, but not for VFs
2506          */
2507         if (!firstEntryName)
2508             firstEntryName = g_strdup(entry->d_name);
2509 
2510         /* if the caller sent a physPortID, compare it to the
2511          * physportID of this netdev. If not, look for entry[idx].
2512          */
2513         if (physPortID) {
2514             g_autofree char *thisPhysPortID = NULL;
2515 
2516             if (virNetDevGetPhysPortID(entry->d_name, &thisPhysPortID) < 0)
2517                 return -1;
2518 
2519             /* if this one doesn't match, keep looking */
2520             if (STRNEQ_NULLABLE(physPortID, thisPhysPortID))
2521                 continue;
2522 
2523         } else {
2524             /* Most switch devices use phys_port_name instead of
2525              * phys_port_id.
2526              * NOTE: VFs' representors net devices can be linked to PF's PCI
2527              * device, which mean that there'll be multiple net devices
2528              * instances and to get a proper net device need to match on
2529              * specific regex.
2530              * To get PF netdev, for ex., used following regex:
2531              * "(p[0-9]+$)|(p[0-9]+s[0-9]+$)"
2532              * or to get exact VF's netdev next regex is used:
2533              * "pf0vf1$"
2534              */
2535             g_autofree char *thisPhysPortName = NULL;
2536 
2537             if (virNetDevGetPhysPortName(entry->d_name, &thisPhysPortName) < 0)
2538                 return -1;
2539 
2540             if (thisPhysPortName) {
2541 
2542                 /* if this one doesn't match, keep looking */
2543                 if (!virStringMatch(thisPhysPortName, VIR_PF_PHYS_PORT_NAME_REGEX))
2544                     continue;
2545 
2546             } else {
2547 
2548                 if (i++ < idx)
2549                     continue;
2550             }
2551         }
2552 
2553         *netname = g_strdup(entry->d_name);
2554         return 0;
2555     }
2556 
2557     if (firstEntryName) {
2558         /* we didn't match the provided phys_port_id / find a
2559          * phys_port_name matching VIR_PF_PHYS_PORT_NAME_REGEX / find
2560          * as many net devices as the value of idx, but this is
2561          * probably because phys_port_id / phys_port_name isn't
2562          * implemented for this NIC driver, so just return the first
2563          * (probably only) netname we found.
2564          */
2565         *netname = g_steal_pointer(&firstEntryName);
2566         return 0;
2567     }
2568 
2569     virReportError(VIR_ERR_INTERNAL_ERROR,
2570                    _("Could not find any network device under PCI device at %s"),
2571                    device_link_sysfs_path);
2572     return -1;
2573 }
2574 
2575 int
virPCIGetVirtualFunctionInfo(const char * vf_sysfs_device_path,int pfNetDevIdx,char ** pfname,int * vf_index)2576 virPCIGetVirtualFunctionInfo(const char *vf_sysfs_device_path,
2577                              int pfNetDevIdx,
2578                              char **pfname,
2579                              int *vf_index)
2580 {
2581     g_autofree virPCIDeviceAddress *pf_config_address = NULL;
2582     g_autofree char *pf_sysfs_device_path = NULL;
2583     g_autofree char *vfname = NULL;
2584     g_autofree char *vfPhysPortID = NULL;
2585 
2586     if (virPCIGetPhysicalFunction(vf_sysfs_device_path, &pf_config_address) < 0)
2587         return -1;
2588 
2589     if (!pf_config_address)
2590         return -1;
2591 
2592     if (virPCIDeviceAddressGetSysfsFile(pf_config_address,
2593                                         &pf_sysfs_device_path) < 0) {
2594         return -1;
2595     }
2596 
2597     if (virPCIGetVirtualFunctionIndex(pf_sysfs_device_path,
2598                                       vf_sysfs_device_path, vf_index) < 0) {
2599         return -1;
2600     }
2601 
2602     /* If the caller hasn't asked for a specific pfNetDevIdx, and VF
2603      * is bound to a netdev, learn that netdev's phys_port_id (if
2604      * available). This can be used to disambiguate when the PF has
2605      * multiple netdevs. If the VF isn't bound to a netdev, then we
2606      * return netdev[pfNetDevIdx] on the PF, which may or may not be
2607      * correct.
2608      */
2609     if (pfNetDevIdx == -1) {
2610         if (virPCIGetNetName(vf_sysfs_device_path, 0, NULL, &vfname) < 0)
2611             return -1;
2612 
2613         if (vfname) {
2614             if (virNetDevGetPhysPortID(vfname, &vfPhysPortID) < 0)
2615                 return -1;
2616         }
2617         pfNetDevIdx = 0;
2618     }
2619 
2620     if (virPCIGetNetName(pf_sysfs_device_path,
2621                          pfNetDevIdx, vfPhysPortID, pfname) < 0) {
2622         return -1;
2623     }
2624 
2625     if (!*pfname) {
2626         /* this shouldn't be possible. A VF can't exist unless its
2627          * PF device is bound to a network driver
2628          */
2629         virReportError(VIR_ERR_INTERNAL_ERROR,
2630                        _("The PF device for VF %s has no network device name"),
2631                        vf_sysfs_device_path);
2632         return -1;
2633     }
2634 
2635     return 0;
2636 }
2637 
2638 
2639 bool
virPCIDeviceHasVPD(virPCIDevice * dev)2640 virPCIDeviceHasVPD(virPCIDevice *dev)
2641 {
2642     g_autofree char *vpdPath = NULL;
2643 
2644     vpdPath = virPCIFile(dev->name, "vpd");
2645     if (!virFileExists(vpdPath)) {
2646         VIR_INFO("Device VPD file does not exist %s", vpdPath);
2647         return false;
2648     } else if (!virFileIsRegular(vpdPath)) {
2649         VIR_WARN("VPD path does not point to a regular file %s", vpdPath);
2650         return false;
2651     }
2652     return true;
2653 }
2654 
2655 /**
2656  * virPCIDeviceGetVPD:
2657  * @dev: a PCI device to get a PCI VPD for.
2658  *
2659  * Obtain a PCI device's Vital Product Data (VPD). VPD is optional in
2660  * both PCI Local Bus and PCIe specifications so there is no guarantee it
2661  * will be there for a particular device.
2662  *
2663  * Returns: a pointer to virPCIVPDResource which needs to be freed by the caller
2664  * or NULL if getting it failed for some reason (e.g. invalid format, I/O error).
2665  */
2666 virPCIVPDResource *
virPCIDeviceGetVPD(virPCIDevice * dev)2667 virPCIDeviceGetVPD(virPCIDevice *dev)
2668 {
2669     g_autofree char *vpdPath = NULL;
2670     int fd;
2671     g_autoptr(virPCIVPDResource) res = NULL;
2672 
2673     vpdPath = virPCIFile(dev->name, "vpd");
2674     if (!virPCIDeviceHasVPD(dev)) {
2675         virReportError(VIR_ERR_INTERNAL_ERROR, _("Device %s does not have a VPD"),
2676                 virPCIDeviceGetName(dev));
2677         return NULL;
2678     }
2679     if ((fd = open(vpdPath, O_RDONLY)) < 0) {
2680         virReportSystemError(-fd, _("Failed to open a VPD file '%s'"), vpdPath);
2681         return NULL;
2682     }
2683     res = virPCIVPDParse(fd);
2684 
2685     if (VIR_CLOSE(fd) < 0) {
2686         virReportSystemError(errno, _("Unable to close the VPD file, fd: %d"), fd);
2687         return NULL;
2688     }
2689 
2690     return g_steal_pointer(&res);
2691 }
2692 
2693 #else
/* Message shared by the stub implementations that follow. It is wrapped
 * in N_() here and passed through _() at each point of use.
 */
static const char *unsupported = N_("not supported on non-linux platforms");
2695 
2696 virPCIDeviceAddress *
virPCIGetDeviceAddressFromSysfsLink(const char * device_link G_GNUC_UNUSED)2697 virPCIGetDeviceAddressFromSysfsLink(const char *device_link G_GNUC_UNUSED)
2698 {
2699     virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2700     return NULL;
2701 }
2702 
2703 
2704 int
virPCIGetPhysicalFunction(const char * vf_sysfs_path G_GNUC_UNUSED,virPCIDeviceAddress ** pf G_GNUC_UNUSED)2705 virPCIGetPhysicalFunction(const char *vf_sysfs_path G_GNUC_UNUSED,
2706                           virPCIDeviceAddress **pf G_GNUC_UNUSED)
2707 {
2708     virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2709     return -1;
2710 }
2711 
2712 int
virPCIGetVirtualFunctionsFull(const char * sysfs_path G_GNUC_UNUSED,virPCIVirtualFunctionList ** vfs G_GNUC_UNUSED,const char * pfPhysPortID G_GNUC_UNUSED)2713 virPCIGetVirtualFunctionsFull(const char *sysfs_path G_GNUC_UNUSED,
2714                               virPCIVirtualFunctionList **vfs G_GNUC_UNUSED,
2715                               const char *pfPhysPortID G_GNUC_UNUSED)
2716 {
2717     virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2718     return -1;
2719 }
2720 
2721 int
virPCIIsVirtualFunction(const char * vf_sysfs_device_link G_GNUC_UNUSED)2722 virPCIIsVirtualFunction(const char *vf_sysfs_device_link G_GNUC_UNUSED)
2723 {
2724     virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2725     return -1;
2726 }
2727 
2728 int
virPCIGetVirtualFunctionIndex(const char * pf_sysfs_device_link G_GNUC_UNUSED,const char * vf_sysfs_device_link G_GNUC_UNUSED,int * vf_index G_GNUC_UNUSED)2729 virPCIGetVirtualFunctionIndex(const char *pf_sysfs_device_link G_GNUC_UNUSED,
2730                               const char *vf_sysfs_device_link G_GNUC_UNUSED,
2731                               int *vf_index G_GNUC_UNUSED)
2732 {
2733     virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2734     return -1;
2735 
2736 }
2737 
2738 
2739 int
virPCIDeviceAddressGetSysfsFile(virPCIDeviceAddress * dev G_GNUC_UNUSED,char ** pci_sysfs_device_link G_GNUC_UNUSED)2740 virPCIDeviceAddressGetSysfsFile(virPCIDeviceAddress *dev G_GNUC_UNUSED,
2741                                 char **pci_sysfs_device_link G_GNUC_UNUSED)
2742 {
2743     virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2744     return -1;
2745 }
2746 
2747 int
virPCIGetNetName(const char * device_link_sysfs_path G_GNUC_UNUSED,size_t idx G_GNUC_UNUSED,const char * physPortID G_GNUC_UNUSED,char ** netname G_GNUC_UNUSED)2748 virPCIGetNetName(const char *device_link_sysfs_path G_GNUC_UNUSED,
2749                  size_t idx G_GNUC_UNUSED,
2750                  const char *physPortID G_GNUC_UNUSED,
2751                  char **netname G_GNUC_UNUSED)
2752 {
2753     virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2754     return -1;
2755 }
2756 
2757 int
virPCIGetVirtualFunctionInfo(const char * vf_sysfs_device_path G_GNUC_UNUSED,int pfNetDevIdx G_GNUC_UNUSED,char ** pfname G_GNUC_UNUSED,int * vf_index G_GNUC_UNUSED)2758 virPCIGetVirtualFunctionInfo(const char *vf_sysfs_device_path G_GNUC_UNUSED,
2759                              int pfNetDevIdx G_GNUC_UNUSED,
2760                              char **pfname G_GNUC_UNUSED,
2761                              int *vf_index G_GNUC_UNUSED)
2762 {
2763     virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2764     return -1;
2765 }
2766 
2767 bool
virPCIDeviceHasVPD(virPCIDevice * dev G_GNUC_UNUSED)2768 virPCIDeviceHasVPD(virPCIDevice *dev G_GNUC_UNUSED)
2769 {
2770     virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2771     return NULL;
2772 }
2773 
2774 virPCIVPDResource *
virPCIDeviceGetVPD(virPCIDevice * dev G_GNUC_UNUSED)2775 virPCIDeviceGetVPD(virPCIDevice *dev G_GNUC_UNUSED)
2776 {
2777     virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
2778     return NULL;
2779 }
2780 #endif /* __linux__ */
2781 
2782 int
virPCIDeviceIsPCIExpress(virPCIDevice * dev)2783 virPCIDeviceIsPCIExpress(virPCIDevice *dev)
2784 {
2785     int fd;
2786     int ret = -1;
2787 
2788     if ((fd = virPCIDeviceConfigOpen(dev)) < 0)
2789         return ret;
2790 
2791     if (virPCIDeviceInit(dev, fd) < 0)
2792         goto cleanup;
2793 
2794     ret = dev->is_pcie;
2795 
2796  cleanup:
2797     virPCIDeviceConfigClose(dev, fd);
2798     return ret;
2799 }
2800 
2801 int
virPCIDeviceHasPCIExpressLink(virPCIDevice * dev)2802 virPCIDeviceHasPCIExpressLink(virPCIDevice *dev)
2803 {
2804     int fd;
2805     int ret = -1;
2806     uint16_t cap, type;
2807 
2808     if ((fd = virPCIDeviceConfigOpen(dev)) < 0)
2809         return ret;
2810 
2811     if (virPCIDeviceInit(dev, fd) < 0)
2812         goto cleanup;
2813 
2814     if (dev->pcie_cap_pos == 0) {
2815         ret = 0;
2816         goto cleanup;
2817     }
2818 
2819     cap = virPCIDeviceRead16(dev, fd, dev->pcie_cap_pos + PCI_CAP_FLAGS);
2820     type = (cap & PCI_EXP_FLAGS_TYPE) >> 4;
2821 
2822     ret = type != PCI_EXP_TYPE_ROOT_INT_EP && type != PCI_EXP_TYPE_ROOT_EC;
2823 
2824  cleanup:
2825     virPCIDeviceConfigClose(dev, fd);
2826     return ret;
2827 }
2828 
2829 int
virPCIDeviceGetLinkCapSta(virPCIDevice * dev,int * cap_port,unsigned int * cap_speed,unsigned int * cap_width,unsigned int * sta_speed,unsigned int * sta_width)2830 virPCIDeviceGetLinkCapSta(virPCIDevice *dev,
2831                           int *cap_port,
2832                           unsigned int *cap_speed,
2833                           unsigned int *cap_width,
2834                           unsigned int *sta_speed,
2835                           unsigned int *sta_width)
2836 {
2837     uint32_t t;
2838     int fd;
2839     int ret = -1;
2840 
2841     if ((fd = virPCIDeviceConfigOpen(dev)) < 0)
2842         return ret;
2843 
2844     if (virPCIDeviceInit(dev, fd) < 0)
2845         goto cleanup;
2846 
2847     if (!dev->pcie_cap_pos) {
2848         virReportError(VIR_ERR_INTERNAL_ERROR,
2849                        _("pci device %s is not a PCI-Express device"),
2850                        dev->name);
2851         goto cleanup;
2852     }
2853 
2854     t = virPCIDeviceRead32(dev, fd, dev->pcie_cap_pos + PCI_EXP_LNKCAP);
2855 
2856     *cap_port = t >> 24;
2857     *cap_speed = t & PCI_EXP_LNKCAP_SPEED;
2858     *cap_width = (t & PCI_EXP_LNKCAP_WIDTH) >> 4;
2859 
2860     t = virPCIDeviceRead16(dev, fd, dev->pcie_cap_pos + PCI_EXP_LNKSTA);
2861 
2862     *sta_speed = t & PCI_EXP_LNKSTA_SPEED;
2863     *sta_width = (t & PCI_EXP_LNKSTA_WIDTH) >> 4;
2864     ret = 0;
2865 
2866  cleanup:
2867     virPCIDeviceConfigClose(dev, fd);
2868     return ret;
2869 }
2870 
2871 
/**
 * virPCIGetHeaderType:
 * @dev: PCI device to query
 * @hdrType: filled with the header type; set to -1 on any failure
 *
 * Reads the PCI_HEADER_TYPE byte from the device's config, masks it
 * with PCI_HEADER_TYPE_MASK and stores it in @hdrType if it is below
 * VIR_PCI_HEADER_LAST. An error is reported for any other value.
 *
 * Returns: 0 on success, -1 if the config could not be opened or the
 * header type is unknown.
 */
int
virPCIGetHeaderType(virPCIDevice *dev, int *hdrType)
{
    uint8_t header;
    int cfgFd;

    *hdrType = -1;

    cfgFd = virPCIDeviceConfigOpen(dev);
    if (cfgFd < 0)
        return -1;

    header = virPCIDeviceRead8(dev, cfgFd, PCI_HEADER_TYPE) & PCI_HEADER_TYPE_MASK;

    virPCIDeviceConfigClose(dev, cfgFd);

    if (header >= VIR_PCI_HEADER_LAST) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unknown PCI header type '%d' for device '%s'"),
                       header, dev->name);
        return -1;
    }

    *hdrType = header;
    return 0;
}
2898 
2899 
2900 void
virPCIEDeviceInfoFree(virPCIEDeviceInfo * dev)2901 virPCIEDeviceInfoFree(virPCIEDeviceInfo *dev)
2902 {
2903     if (!dev)
2904         return;
2905 
2906     g_free(dev->link_cap);
2907     g_free(dev->link_sta);
2908     g_free(dev);
2909 }
2910 
2911 void
virPCIDeviceAddressFree(virPCIDeviceAddress * address)2912 virPCIDeviceAddressFree(virPCIDeviceAddress *address)
2913 {
2914     g_free(address);
2915 }
2916