1 /**
2 * collectd - src/pcie_errors.c
3 *
4 * Copyright(c) 2018 Intel Corporation. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 *
24 * Authors:
25 * Kamil Wiatrowski <kamilx.wiatrowski@intel.com>
26 **/
27
28 #include "collectd.h"
29
30 #include "utils/common/common.h"
31 #include "utils_llist.h"
32
33 #include <linux/pci_regs.h>
34
/* Plugin name; used for callback registration and as log message prefix. */
#define PCIE_ERRORS_PLUGIN "pcie_errors"
/* Default access directories, used when AccessDir is not configured. */
#define PCIE_DEFAULT_PROCDIR "/proc/bus/pci"
#define PCIE_DEFAULT_SYSFSDIR "/sys/bus/pci"
/* Size of the buffers holding file/directory names built from access dir. */
#define PCIE_NAME_LEN 512
/* Size of the line buffer and of the errno message buffers. */
#define PCIE_BUFF_SIZE 1024

/* Notification type and the type instances describing error severity. */
#define PCIE_ERROR "pcie_error"
#define PCIE_SEV_CE "correctable"
#define PCIE_SEV_FATAL "fatal"
#define PCIE_SEV_NOFATAL "non_fatal"

/* Extract device (bits 7:3) and function (bits 2:0) number from a slot. */
#define PCIE_DEV(x) (((x) >> 3) & 0x1f)
#define PCIE_FN(x) ((x)&0x07)

#define PCIE_ECAP_OFFSET 0x100 /* Extended capabilities always begin at offset 0x100 */
50
/* Plugin configuration; options are set by pcie_plugin_config() and the
 * default access directory is filled in by pcie_access_config(). */
typedef struct pcie_config_s {
  bool use_sysfs;    /* true: read through sysfs, false: read through proc */
  bool notif_masked; /* ReportMasked: also notify about masked AER errors */
  bool persistent;   /* PersistentNotifications: repeat notifications for
                        errors that were already reported */
  char access_dir[PATH_MAX]; /* AccessDir; empty string selects the default */
} pcie_config_t;
57
/* State kept for a single PCI device between read cycles. */
typedef struct pcie_device_s {
  int fd; /* descriptor of the opened configuration space file; set by
             pcie_open(), reset to -1 by pcie_close() */
  int domain;
  uint8_t bus;
  uint8_t device;
  uint8_t function;
  int cap_exp;  /* offset of PCI Express Capability Structure, -1 if none */
  int ecap_aer; /* offset of AER extended capability, -1 if none */
  uint16_t device_status;      /* Device Status error bits seen last time */
  uint32_t correctable_errors; /* AER Correctable Error Status seen last time */
  uint32_t uncorrectable_errors; /* AER Uncorrectable Error Status seen last
                                    time */
} pcie_device_t;
70
/* Access callbacks; assigned in pcie_access_config() depending on whether
 * proc or sysfs is the configured source. */
typedef struct pcie_fops_s {
  /* Append every device found to dev_list; returns 0 on success. */
  int (*list_devices)(llist_t *dev_list);
  /* Open the configuration space of dev; returns 0 on success. */
  int (*open)(pcie_device_t *dev);
  /* Close the configuration space of dev. */
  void (*close)(pcie_device_t *dev);
  /* Read size bytes at offset pos into buff; returns 0 on success. */
  int (*read)(pcie_device_t *dev, void *buff, int size, int pos);
} pcie_fops_t;
77
/* Maps one bit of an error status register to its description. */
typedef struct pcie_error_s {
  int mask;         /* bit mask of the error inside the status register */
  const char *desc; /* human readable name used in notification messages */
} pcie_error_t;
82
/* List of monitored devices (entry values are pcie_device_t); created in
 * pcie_init() and released in pcie_shutdown(). */
static llist_t *pcie_dev_list;
/* Plugin configuration; sysfs is the default source. */
static pcie_config_t pcie_config = {.access_dir = "", .use_sysfs = true};
/* Access callbacks, assigned in pcie_access_config(). */
static pcie_fops_t pcie_fops;
86
/* Device Error Status: error bits checked in the PCI Express Device Status
 * register, see pcie_check_dev_status(). */
static const pcie_error_t pcie_base_errors[] = {
    {PCI_EXP_DEVSTA_CED, "Correctable Error"},
    {PCI_EXP_DEVSTA_NFED, "Non-Fatal Error"},
    {PCI_EXP_DEVSTA_FED, "Fatal Error"},
    {PCI_EXP_DEVSTA_URD, "Unsupported Request"}};
static const int pcie_base_errors_num = STATIC_ARRAY_SIZE(pcie_base_errors);
94
/* Uncorrectable Error Status: bits of the AER Uncorrectable Error Status
 * register. Every entry is guarded by #ifdef so that only the bits known to
 * the installed <linux/pci_regs.h> are compiled in. */
static const pcie_error_t pcie_aer_ues[] = {
#ifdef PCI_ERR_UNC_DLP
    {PCI_ERR_UNC_DLP, "Data Link Protocol"},
#endif
#ifdef PCI_ERR_UNC_SURPDN
    {PCI_ERR_UNC_SURPDN, "Surprise Down"},
#endif
#ifdef PCI_ERR_UNC_POISON_TLP
    {PCI_ERR_UNC_POISON_TLP, "Poisoned TLP"},
#endif
#ifdef PCI_ERR_UNC_FCP
    {PCI_ERR_UNC_FCP, "Flow Control Protocol"},
#endif
#ifdef PCI_ERR_UNC_COMP_TIME
    {PCI_ERR_UNC_COMP_TIME, "Completion Timeout"},
#endif
#ifdef PCI_ERR_UNC_COMP_ABORT
    {PCI_ERR_UNC_COMP_ABORT, "Completer Abort"},
#endif
#ifdef PCI_ERR_UNC_UNX_COMP
    {PCI_ERR_UNC_UNX_COMP, "Unexpected Completion"},
#endif
#ifdef PCI_ERR_UNC_RX_OVER
    {PCI_ERR_UNC_RX_OVER, "Receiver Overflow"},
#endif
#ifdef PCI_ERR_UNC_MALF_TLP
    {PCI_ERR_UNC_MALF_TLP, "Malformed TLP"},
#endif
#ifdef PCI_ERR_UNC_ECRC
    {PCI_ERR_UNC_ECRC, "ECRC Error Status"},
#endif
#ifdef PCI_ERR_UNC_UNSUP
    {PCI_ERR_UNC_UNSUP, "Unsupported Request"},
#endif
#ifdef PCI_ERR_UNC_ACSV
    {PCI_ERR_UNC_ACSV, "ACS Violation"},
#endif
#ifdef PCI_ERR_UNC_INTN
    {PCI_ERR_UNC_INTN, "Internal"},
#endif
#ifdef PCI_ERR_UNC_MCBTLP
    {PCI_ERR_UNC_MCBTLP, "MC blocked TLP"},
#endif
#ifdef PCI_ERR_UNC_ATOMEG
    {PCI_ERR_UNC_ATOMEG, "Atomic egress blocked"},
#endif
#ifdef PCI_ERR_UNC_TLPPRE
    {PCI_ERR_UNC_TLPPRE, "TLP prefix blocked"},
#endif
};
static const int pcie_aer_ues_num = STATIC_ARRAY_SIZE(pcie_aer_ues);
147
/* Correctable Error Status: bits of the AER Correctable Error Status
 * register. Every entry is guarded by #ifdef so that only the bits known to
 * the installed <linux/pci_regs.h> are compiled in. */
static const pcie_error_t pcie_aer_ces[] = {
#ifdef PCI_ERR_COR_RCVR
    {PCI_ERR_COR_RCVR, "Receiver Error Status"},
#endif
#ifdef PCI_ERR_COR_BAD_TLP
    {PCI_ERR_COR_BAD_TLP, "Bad TLP Status"},
#endif
#ifdef PCI_ERR_COR_BAD_DLLP
    {PCI_ERR_COR_BAD_DLLP, "Bad DLLP Status"},
#endif
#ifdef PCI_ERR_COR_REP_ROLL
    {PCI_ERR_COR_REP_ROLL, "REPLAY_NUM Rollover"},
#endif
#ifdef PCI_ERR_COR_REP_TIMER
    {PCI_ERR_COR_REP_TIMER, "Replay Timer Timeout"},
#endif
#ifdef PCI_ERR_COR_ADV_NFAT
    {PCI_ERR_COR_ADV_NFAT, "Advisory Non-Fatal"},
#endif
#ifdef PCI_ERR_COR_INTERNAL
    {PCI_ERR_COR_INTERNAL, "Corrected Internal"},
#endif
#ifdef PCI_ERR_COR_LOG_OVER
    {PCI_ERR_COR_LOG_OVER, "Header Log Overflow"},
#endif
};
static const int pcie_aer_ces_num = STATIC_ARRAY_SIZE(pcie_aer_ces);
176
/* Allocate a device descriptor for the given PCI address and append it to
 * the list. The list entry has no key; its value is the new descriptor.
 *
 * list:   list the device is appended to
 * domain: PCI domain number
 * bus:    PCI bus number
 * device: PCI device number
 * fn:     PCI function number
 *
 * Returns 0 on success, -ENOMEM if the descriptor or the list entry cannot
 * be allocated.
 */
static int pcie_add_device(llist_t *list, int domain, uint8_t bus,
                           uint8_t device, uint8_t fn) {
  pcie_device_t *pdev = calloc(1, sizeof(*pdev));
  if (!pdev) {
    ERROR(PCIE_ERRORS_PLUGIN ": Failed to allocate device");
    return -ENOMEM;
  }

  /* Capability offsets are looked up later; -1 marks them as not found. */
  pdev->cap_exp = -1;
  pdev->ecap_aer = -1;

  pdev->domain = domain;
  pdev->bus = bus;
  pdev->device = device;
  pdev->function = fn;

  llentry_t *node = llentry_create(NULL, pdev);
  if (!node) {
    ERROR(PCIE_ERRORS_PLUGIN ": Failed to create llentry");
    sfree(pdev);
    return -ENOMEM;
  }

  llist_append(list, node);
  DEBUG(PCIE_ERRORS_PLUGIN ": pci device added to list: %04x:%02x:%02x.%d",
        domain, bus, device, fn);

  return 0;
}
204
/* Free every device descriptor stored in the list and then destroy the list
 * itself. A NULL list is silently ignored. */
static void pcie_clear_list(llist_t *list) {
  if (!list)
    return;

  llentry_t *node = llist_head(list);
  while (node != NULL) {
    sfree(node->value);
    node = node->next;
  }

  llist_destroy(list);
}
214
/* Build the list of PCI devices from the "<access_dir>/devices" file of the
 * proc interface. Only the first hexadecimal field of every line (the slot)
 * is used: bits 15:8 are the bus, bits 7:3 the device and bits 2:0 the
 * function. The domain is always set to 0 because this file format carries
 * no domain information that is parsed here.
 *
 * dev_list: list that receives one pcie_device_t per parsed line
 *
 * Returns 0 on success (also when no device was found), -EINVAL for a NULL
 * list or a too long access directory, -ENOENT if the file cannot be opened,
 * or the error returned by pcie_add_device().
 */
static int pcie_list_devices_proc(llist_t *dev_list) {
  char file_name[PCIE_NAME_LEN];
  char buf[PCIE_BUFF_SIZE];

  if (dev_list == NULL)
    return -EINVAL;

  int len = snprintf(file_name, sizeof(file_name), "%s/devices",
                     pcie_config.access_dir);
  if (len < 1 || (size_t)len >= sizeof(file_name)) {
    ERROR(PCIE_ERRORS_PLUGIN ": Access dir `%s' is too long (%d)",
          pcie_config.access_dir, len);
    return -EINVAL;
  }

  FILE *fh = fopen(file_name, "r");
  if (!fh) {
    char errbuf[PCIE_BUFF_SIZE];
    ERROR(PCIE_ERRORS_PLUGIN ": Cannot open file %s to get devices list: %s",
          file_name, sstrerror(errno, errbuf, sizeof(errbuf)));
    return -ENOENT;
  }

  /* The result is tracked separately from the snprintf() length so that an
   * empty or unparsable file does not leak a positive value to the caller. */
  int ret = 0;
  unsigned int line_no = 0;

  while (fgets(buf, sizeof(buf), fh)) {
    unsigned int slot;

    /* Count every line read so the number reported below is accurate even
     * after earlier lines failed to parse. */
    ++line_no;

    if (sscanf(buf, "%x", &slot) != 1) {
      ERROR(PCIE_ERRORS_PLUGIN ": Failed to read line %u from %s", line_no,
            file_name);
      continue;
    }

    uint8_t bus = (uint8_t)(slot >> 8U);
    uint8_t dev = PCIE_DEV(slot);
    uint8_t fn = PCIE_FN(slot);
    ret = pcie_add_device(dev_list, 0, bus, dev, fn);
    if (ret)
      break;
  }

  fclose(fh);
  return ret;
}
262
/* Build the list of PCI devices from the entries of the
 * "<access_dir>/devices" directory of the sysfs interface. Entry names are
 * parsed as "domain:bus:device.function"; entries starting with '.' are
 * skipped and entries that do not parse are logged and skipped.
 *
 * dev_list: list that receives one pcie_device_t per parsed entry
 *
 * Returns 0 on success (also when no device was found), -EINVAL for a NULL
 * list or a too long access directory, -ENOENT if the directory cannot be
 * opened, or the error returned by pcie_add_device().
 */
static int pcie_list_devices_sysfs(llist_t *dev_list) {
  char dir_name[PCIE_NAME_LEN];

  if (dev_list == NULL)
    return -EINVAL;

  int len = snprintf(dir_name, sizeof(dir_name), "%s/devices",
                     pcie_config.access_dir);
  if (len < 1 || (size_t)len >= sizeof(dir_name)) {
    ERROR(PCIE_ERRORS_PLUGIN ": Access dir `%s' is too long (%d)",
          pcie_config.access_dir, len);
    return -EINVAL;
  }

  DIR *dir = opendir(dir_name);
  if (!dir) {
    char errbuf[PCIE_BUFF_SIZE];
    ERROR(PCIE_ERRORS_PLUGIN ": Cannot open dir %s to get devices list: %s",
          dir_name, sstrerror(errno, errbuf, sizeof(errbuf)));
    return -ENOENT;
  }

  /* The result is tracked separately from the snprintf() length so that a
   * directory without device entries does not leak a positive value to the
   * caller. */
  int ret = 0;
  struct dirent *item;

  while ((item = readdir(dir))) {
    unsigned int dom, bus, dev;
    int fn;

    /* Omit special non-device entries */
    if (item->d_name[0] == '.')
      continue;

    if (sscanf(item->d_name, "%x:%x:%x.%d", &dom, &bus, &dev, &fn) != 4) {
      ERROR(PCIE_ERRORS_PLUGIN ": Failed to parse entry %s", item->d_name);
      continue;
    }

    ret = pcie_add_device(dev_list, dom, bus, dev, fn);
    if (ret)
      break;
  }

  closedir(dir);
  return ret;
}
308
/* Close the configuration space file of the device. A failure is only
 * logged; the stored descriptor is reset to -1 in every case. */
static void pcie_close(pcie_device_t *dev) {
  const int rc = close(dev->fd);

  if (rc == -1) {
    char errbuf[PCIE_BUFF_SIZE];
    ERROR(PCIE_ERRORS_PLUGIN ": Failed to close %04x:%02x:%02x.%d, fd=%d: %s",
          dev->domain, dev->bus, dev->device, dev->function, dev->fd,
          sstrerror(errno, errbuf, sizeof(errbuf)));
  }

  dev->fd = -1;
}
319
/* Open the file `name` read-only and store the descriptor in dev->fd.
 *
 * Returns 0 on success; on failure the reason is logged, dev->fd is -1 and
 * -ENOENT is returned.
 */
static int pcie_open(pcie_device_t *dev, const char *name) {
  const int fd = open(name, O_RDONLY);

  dev->fd = fd;
  if (fd != -1)
    return 0;

  char errbuf[PCIE_BUFF_SIZE];
  ERROR(PCIE_ERRORS_PLUGIN ": Failed to open file %s: %s", name,
        sstrerror(errno, errbuf, sizeof(errbuf)));
  return -ENOENT;
}
331
/* Open the configuration space of the device through the proc interface,
 * i.e. the file "<access_dir>/<bus>/<device>.<function>".
 *
 * Returns the result of pcie_open(), or -EINVAL if the path does not fit
 * into the name buffer.
 */
static int pcie_open_proc(pcie_device_t *dev) {
  char path[PCIE_NAME_LEN];
  const int len =
      snprintf(path, sizeof(path), "%s/%02x/%02x.%d", pcie_config.access_dir,
               dev->bus, dev->device, dev->function);

  if (len >= 1 && (size_t)len < sizeof(path))
    return pcie_open(dev, path);

  ERROR(PCIE_ERRORS_PLUGIN ": Access dir `%s' is too long (%d)",
        pcie_config.access_dir, len);
  return -EINVAL;
}
346
/* Open the configuration space of the device through the sysfs interface,
 * i.e. the file "<access_dir>/devices/<domain>:<bus>:<device>.<fn>/config".
 *
 * Returns the result of pcie_open(), or -EINVAL if the path does not fit
 * into the name buffer.
 */
static int pcie_open_sysfs(pcie_device_t *dev) {
  char path[PCIE_NAME_LEN];
  const int len = snprintf(path, sizeof(path),
                           "%s/devices/%04x:%02x:%02x.%d/config",
                           pcie_config.access_dir, dev->domain, dev->bus,
                           dev->device, dev->function);

  if (len >= 1 && (size_t)len < sizeof(path))
    return pcie_open(dev, path);

  ERROR(PCIE_ERRORS_PLUGIN ": Access dir `%s' is too long (%d)",
        pcie_config.access_dir, len);
  return -EINVAL;
}
362
/* Read exactly `size` bytes at offset `pos` of the opened configuration
 * space into `buff`.
 *
 * Returns 0 if all bytes were read; otherwise logs the read error or the
 * short read and returns -1.
 */
static int pcie_read(pcie_device_t *dev, void *buff, int size, int pos) {
  const int nread = pread(dev->fd, buff, size, pos);

  if (nread != size) {
    if (nread == -1) {
      char errbuf[PCIE_BUFF_SIZE];
      ERROR(PCIE_ERRORS_PLUGIN
            ": Failed to read %04x:%02x:%02x.%d at pos %d: %s",
            dev->domain, dev->bus, dev->device, dev->function, pos,
            sstrerror(errno, errbuf, sizeof(errbuf)));
    } else {
      ERROR(PCIE_ERRORS_PLUGIN
            ": %04x:%02x:%02x.%d Read only %d bytes, should be %d",
            dev->domain, dev->bus, dev->device, dev->function, nread, size);
    }
    return -1;
  }

  return 0;
}
380
/* Read one byte at offset `pos` of the configuration space.
 * Returns the byte, or 0 if the read callback reports a failure. */
static uint8_t pcie_read8(pcie_device_t *dev, int pos) {
  uint8_t byte;
  const int rc = pcie_fops.read(dev, &byte, 1, pos);

  return (rc == 0) ? byte : 0;
}
387
/* Read a 16-bit register at offset `pos` of the configuration space.
 *
 * PCI configuration space is little-endian, so the value is assembled from
 * the individual bytes instead of reinterpreting them in host byte order;
 * this gives the same result on little- and big-endian hosts.
 *
 * Returns the register value, or 0 if the read callback reports a failure.
 */
static uint16_t pcie_read16(pcie_device_t *dev, int pos) {
  uint8_t raw[2];

  if (pcie_fops.read(dev, raw, 2, pos))
    return 0;

  return (uint16_t)((uint16_t)raw[0] | ((uint16_t)raw[1] << 8));
}
394
/* Read a 32-bit register at offset `pos` of the configuration space.
 *
 * PCI configuration space is little-endian, so the value is assembled from
 * the individual bytes instead of reinterpreting them in host byte order;
 * this gives the same result on little- and big-endian hosts.
 *
 * Returns the register value, or 0 if the read callback reports a failure.
 */
static uint32_t pcie_read32(pcie_device_t *dev, int pos) {
  uint8_t raw[4];

  if (pcie_fops.read(dev, raw, 4, pos))
    return 0;

  return (uint32_t)raw[0] | ((uint32_t)raw[1] << 8) |
         ((uint32_t)raw[2] << 16) | ((uint32_t)raw[3] << 24);
}
401
/* Complete the notification with host, plugin instance (the PCI address of
 * the device), type and type instance, then dispatch it.
 *
 * dev:           device the notification is about
 * n:             notification with severity, time, plugin and message set
 * type:          notification type
 * type_instance: notification type instance
 */
static void pcie_dispatch_notification(pcie_device_t *dev, notification_t *n,
                                       const char *type,
                                       const char *type_instance) {
  sstrncpy(n->type, type, sizeof(n->type));
  sstrncpy(n->type_instance, type_instance, sizeof(n->type_instance));
  sstrncpy(n->host, hostname_g, sizeof(n->host));

  /* The plugin instance identifies the device as domain:bus:device.fn */
  snprintf(n->plugin_instance, sizeof(n->plugin_instance), "%04x:%02x:%02x.%d",
           dev->domain, dev->bus, dev->device, dev->function);

  plugin_dispatch_notification(n);
}
413
414 /* Report errors found in AER Correctable Error Status register */
pcie_dispatch_correctable_errors(pcie_device_t * dev,uint32_t errors,uint32_t masked)415 static void pcie_dispatch_correctable_errors(pcie_device_t *dev,
416 uint32_t errors, uint32_t masked) {
417 for (int i = 0; i < pcie_aer_ces_num; i++) {
418 const pcie_error_t *err = pcie_aer_ces + i;
419 notification_t n = {.severity = NOTIF_WARNING,
420 .time = cdtime(),
421 .plugin = PCIE_ERRORS_PLUGIN,
422 .meta = NULL};
423
424 /* If not specifically set by config option omit masked errors */
425 if (!pcie_config.notif_masked && (err->mask & masked))
426 continue;
427
428 if (err->mask & errors) {
429 /* Error already reported, notify only if persistent is set */
430 if (!pcie_config.persistent && (err->mask & dev->correctable_errors))
431 continue;
432
433 DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s set", dev->domain,
434 dev->bus, dev->device, dev->function, err->desc);
435 snprintf(n.message, sizeof(n.message), "Correctable Error set: %s",
436 err->desc);
437 pcie_dispatch_notification(dev, &n, PCIE_ERROR, PCIE_SEV_CE);
438
439 } else if (err->mask & dev->correctable_errors) {
440 DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s cleared", dev->domain,
441 dev->bus, dev->device, dev->function, err->desc);
442
443 n.severity = NOTIF_OKAY;
444 snprintf(n.message, sizeof(n.message), "Correctable Error cleared: %s",
445 err->desc);
446 pcie_dispatch_notification(dev, &n, PCIE_ERROR, PCIE_SEV_CE);
447 }
448 }
449 }
450
451 /* Report errors found in AER Uncorrectable Error Status register */
pcie_dispatch_uncorrectable_errors(pcie_device_t * dev,uint32_t errors,uint32_t masked,uint32_t severity)452 static void pcie_dispatch_uncorrectable_errors(pcie_device_t *dev,
453 uint32_t errors, uint32_t masked,
454 uint32_t severity) {
455 for (int i = 0; i < pcie_aer_ues_num; i++) {
456 const pcie_error_t *err = pcie_aer_ues + i;
457 const char *type_instance =
458 (severity & err->mask) ? PCIE_SEV_FATAL : PCIE_SEV_NOFATAL;
459 notification_t n = {
460 .time = cdtime(), .plugin = PCIE_ERRORS_PLUGIN, .meta = NULL};
461
462 /* If not specifically set by config option omit masked errors */
463 if (!pcie_config.notif_masked && (err->mask & masked))
464 continue;
465
466 if (err->mask & errors) {
467 /* Error already reported, notify only if persistent is set */
468 if (!pcie_config.persistent && (err->mask & dev->uncorrectable_errors))
469 continue;
470
471 DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s(%s) set", dev->domain,
472 dev->bus, dev->device, dev->function, err->desc, type_instance);
473
474 n.severity = (severity & err->mask) ? NOTIF_FAILURE : NOTIF_WARNING;
475 snprintf(n.message, sizeof(n.message), "Uncorrectable(%s) Error set: %s",
476 type_instance, err->desc);
477 pcie_dispatch_notification(dev, &n, PCIE_ERROR, type_instance);
478
479 } else if (err->mask & dev->uncorrectable_errors) {
480 DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s(%s) cleared",
481 dev->domain, dev->bus, dev->device, dev->function, err->desc,
482 type_instance);
483
484 n.severity = NOTIF_OKAY;
485 snprintf(n.message, sizeof(n.message),
486 "Uncorrectable(%s) Error cleared: %s", type_instance, err->desc);
487 pcie_dispatch_notification(dev, &n, PCIE_ERROR, type_instance);
488 }
489 }
490 }
491
492 /* Find offset of PCI Express Capability Structure
493 * in PCI configuration space.
494 * Returns offset, -1 if not found.
495 **/
pcie_find_cap_exp(pcie_device_t * dev)496 static int pcie_find_cap_exp(pcie_device_t *dev) {
497 int pos = pcie_read8(dev, PCI_CAPABILITY_LIST) & ~3;
498
499 while (pos) {
500 uint8_t id = pcie_read8(dev, pos + PCI_CAP_LIST_ID);
501
502 if (id == 0xff)
503 break;
504 if (id == PCI_CAP_ID_EXP)
505 return pos;
506
507 pos = pcie_read8(dev, pos + PCI_CAP_LIST_NEXT) & ~3;
508 }
509
510 DEBUG(PCIE_ERRORS_PLUGIN ": Cannot find CAP EXP for %04x:%02x:%02x.%d",
511 dev->domain, dev->bus, dev->device, dev->function);
512
513 return -1;
514 }
515
516 /* Find offset of Advanced Error Reporting Capability.
517 * Returns AER offset, -1 if not found.
518 **/
pcie_find_ecap_aer(pcie_device_t * dev)519 static int pcie_find_ecap_aer(pcie_device_t *dev) {
520 int pos = PCIE_ECAP_OFFSET;
521 uint32_t header = pcie_read32(dev, pos);
522 int id = PCI_EXT_CAP_ID(header);
523 int next = PCI_EXT_CAP_NEXT(header);
524
525 if (!id && !next)
526 return -1;
527
528 if (id == PCI_EXT_CAP_ID_ERR)
529 return pos;
530
531 while (next) {
532 if (next <= PCIE_ECAP_OFFSET)
533 break;
534
535 header = pcie_read32(dev, next);
536 id = PCI_EXT_CAP_ID(header);
537
538 if (id == PCI_EXT_CAP_ID_ERR)
539 return next;
540
541 next = PCI_EXT_CAP_NEXT(header);
542 }
543
544 return -1;
545 }
546
/* Compare the error bits of the Device Status register with the state
 * remembered from the previous check and notify about every change.
 *
 * dev: device with opened configuration space; dev->device_status is
 *      updated with the new error bits
 * pos: offset of the PCI Express Capability Structure
 */
static void pcie_check_dev_status(pcie_device_t *dev, int pos) {
  /* Keep only the error bits of the Device Status register */
  const uint16_t new_status = pcie_read16(dev, pos + PCI_EXP_DEVSTA) & 0xf;

  /* Something has to be reported if the status changed, or if errors are
   * present and persistent notifications are enabled */
  const bool report = (pcie_config.persistent && new_status) ||
                      (new_status != dev->device_status);
  if (!report)
    return;

  for (int idx = 0; idx < pcie_base_errors_num; idx++) {
    const pcie_error_t *entry = &pcie_base_errors[idx];
    const char *type_instance;
    int severity = NOTIF_WARNING;

    if (entry->mask == PCI_EXP_DEVSTA_FED) {
      type_instance = PCIE_SEV_FATAL;
      severity = NOTIF_FAILURE;
    } else if (entry->mask == PCI_EXP_DEVSTA_CED) {
      type_instance = PCIE_SEV_CE;
    } else {
      type_instance = PCIE_SEV_NOFATAL;
    }

    notification_t n = {.severity = severity,
                        .time = cdtime(),
                        .plugin = PCIE_ERRORS_PLUGIN,
                        .meta = NULL};

    const bool is_set = (entry->mask & new_status) != 0;
    const bool was_set = (entry->mask & dev->device_status) != 0;

    /* Neither set now nor before: nothing to report */
    if (!is_set && !was_set)
      continue;

    /* Already reported errors are repeated only in persistent mode */
    if (is_set && was_set && !pcie_config.persistent)
      continue;

    if (is_set) {
      DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s set", dev->domain,
            dev->bus, dev->device, dev->function, entry->desc);
      snprintf(n.message, sizeof(n.message), "Device Status Error set: %s",
               entry->desc);
    } else {
      DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s cleared", dev->domain,
            dev->bus, dev->device, dev->function, entry->desc);
      n.severity = NOTIF_OKAY;
      snprintf(n.message, sizeof(n.message), "Device Status Error cleared: %s",
               entry->desc);
    }

    pcie_dispatch_notification(dev, &n, PCIE_ERROR, type_instance);
  }

  dev->device_status = new_status;
}
594
/* Read the AER status registers, dispatch notifications for changes and
 * remember the new state in the device descriptor.
 *
 * dev: device with opened configuration space
 * pos: offset of the AER extended capability
 *
 * The mask (and severity) registers are read only when there is something
 * to report: the status changed, or errors are present and persistent
 * notifications are enabled.
 */
static void pcie_check_aer(pcie_device_t *dev, int pos) {
  /* Uncorrectable errors */
  const uint32_t uncor = pcie_read32(dev, pos + PCI_ERR_UNCOR_STATUS);
  const bool report_uncor = (uncor != dev->uncorrectable_errors) ||
                            (pcie_config.persistent && uncor);

  if (report_uncor) {
    const uint32_t uncor_mask = pcie_read32(dev, pos + PCI_ERR_UNCOR_MASK);
    const uint32_t uncor_sever = pcie_read32(dev, pos + PCI_ERR_UNCOR_SEVER);
    pcie_dispatch_uncorrectable_errors(dev, uncor, uncor_mask, uncor_sever);
  }
  dev->uncorrectable_errors = uncor;

  /* Correctable errors */
  const uint32_t cor = pcie_read32(dev, pos + PCI_ERR_COR_STATUS);
  const bool report_cor =
      (cor != dev->correctable_errors) || (pcie_config.persistent && cor);

  if (report_cor) {
    const uint32_t cor_mask = pcie_read32(dev, pos + PCI_ERR_COR_MASK);
    pcie_dispatch_correctable_errors(dev, cor, cor_mask);
  }
  dev->correctable_errors = cor;
}
616
/* Check every device of the list for errors.
 *
 * For each device the configuration space is opened, the Device Status and
 * (if the device has an AER capability) the AER registers are checked, and
 * the configuration space is closed again. A device that cannot be opened
 * causes a failure notification.
 *
 * Returns 0 if all devices were processed, -1 if the list is NULL or at
 * least one device could not be opened.
 */
static int pcie_process_devices(llist_t *devs) {
  if (!devs)
    return -1;

  int status = 0;

  for (llentry_t *node = llist_head(devs); node != NULL; node = node->next) {
    pcie_device_t *dev = node->value;

    if (pcie_fops.open(dev) != 0) {
      notification_t n = {.severity = NOTIF_FAILURE,
                          .time = cdtime(),
                          .message = "Failed to read device status",
                          .plugin = PCIE_ERRORS_PLUGIN,
                          .meta = NULL};
      pcie_dispatch_notification(dev, &n, "", "");
      status = -1;
      continue;
    }

    pcie_check_dev_status(dev, dev->cap_exp);
    if (dev->ecap_aer != -1)
      pcie_check_aer(dev, dev->ecap_aer);

    pcie_fops.close(dev);
  }

  return status;
}
644
645 /* This function is to be called during init to filter out no pcie devices */
pcie_preprocess_devices(llist_t * devs)646 static void pcie_preprocess_devices(llist_t *devs) {
647 llentry_t *e_next;
648
649 if (devs == NULL)
650 return;
651
652 for (llentry_t *e = llist_head(devs); e != NULL; e = e_next) {
653 pcie_device_t *dev = e->value;
654 bool del = false;
655
656 if (pcie_fops.open(dev) == 0) {
657 uint16_t status = pcie_read16(dev, PCI_STATUS);
658 if (status & PCI_STATUS_CAP_LIST)
659 dev->cap_exp = pcie_find_cap_exp(dev);
660
661 /* Every PCIe device must have Capability Structure */
662 if (dev->cap_exp == -1) {
663 DEBUG(PCIE_ERRORS_PLUGIN ": Not PCI Express device: %04x:%02x:%02x.%d",
664 dev->domain, dev->bus, dev->device, dev->function);
665 del = true;
666 } else {
667 dev->ecap_aer = pcie_find_ecap_aer(dev);
668 if (dev->ecap_aer == -1)
669 INFO(PCIE_ERRORS_PLUGIN
670 ": Device is not AER capable: %04x:%02x:%02x.%d",
671 dev->domain, dev->bus, dev->device, dev->function);
672 }
673
674 pcie_fops.close(dev);
675 } else {
676 ERROR(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: failed to open",
677 dev->domain, dev->bus, dev->device, dev->function);
678 del = true;
679 }
680
681 e_next = e->next;
682 if (del) {
683 sfree(dev);
684 llist_remove(devs, e);
685 llentry_destroy(e);
686 }
687 }
688 }
689
/* Read callback: checks all monitored devices for errors.
 * The user data argument is not used.
 * Returns 0 on success, -1 if processing of the devices failed. */
static int pcie_plugin_read(__attribute__((unused)) user_data_t *ud) {
  const int rc = pcie_process_devices(pcie_dev_list);

  if (rc >= 0)
    return 0;

  ERROR(PCIE_ERRORS_PLUGIN ": Failed to read devices state");
  return -1;
}
698
pcie_access_config(void)699 static void pcie_access_config(void) {
700 /* Set functions for register access to
701 * use proc or sysfs depending on config. */
702 if (pcie_config.use_sysfs) {
703 pcie_fops.list_devices = pcie_list_devices_sysfs;
704 pcie_fops.open = pcie_open_sysfs;
705 if (pcie_config.access_dir[0] == '\0')
706 sstrncpy(pcie_config.access_dir, PCIE_DEFAULT_SYSFSDIR,
707 sizeof(pcie_config.access_dir));
708 } else {
709 /* use proc */
710 pcie_fops.list_devices = pcie_list_devices_proc;
711 pcie_fops.open = pcie_open_proc;
712 if (pcie_config.access_dir[0] == '\0')
713 sstrncpy(pcie_config.access_dir, PCIE_DEFAULT_PROCDIR,
714 sizeof(pcie_config.access_dir));
715 }
716 /* Common functions */
717 pcie_fops.close = pcie_close;
718 pcie_fops.read = pcie_read;
719 }
720
/* Configuration callback: parses the options of the plugin block.
 *
 * Supported options:
 *   Source                  - "proc" or "sysfs" (case insensitive); when
 *                             given more than once the last one wins
 *   AccessDir               - base directory used to access the devices
 *   ReportMasked            - boolean, also notify about masked errors
 *   PersistentNotifications - boolean, repeat already reported errors
 *
 * Parsing stops at the first invalid option or value.
 * Returns 0 on success, non-zero otherwise.
 */
static int pcie_plugin_config(oconfig_item_t *ci) {
  int status = 0;

  for (int i = 0; i < ci->children_num; i++) {
    oconfig_item_t *child = ci->children + i;

    if (strcasecmp("Source", child->key) == 0) {
      if ((child->values_num != 1) ||
          (child->values[0].type != OCONFIG_TYPE_STRING)) {
        status = -1;
      } else if (strcasecmp("proc", child->values[0].value.string) == 0) {
        pcie_config.use_sysfs = false;
      } else if (strcasecmp("sysfs", child->values[0].value.string) == 0) {
        /* Set explicitly so that "sysfs" overrides an earlier "proc" */
        pcie_config.use_sysfs = true;
      } else {
        ERROR(PCIE_ERRORS_PLUGIN ": Allowed sources are 'proc' or 'sysfs'.");
        status = -1;
      }
    } else if (strcasecmp("AccessDir", child->key) == 0) {
      status = cf_util_get_string_buffer(child, pcie_config.access_dir,
                                         sizeof(pcie_config.access_dir));
    } else if (strcasecmp("ReportMasked", child->key) == 0) {
      status = cf_util_get_boolean(child, &pcie_config.notif_masked);
    } else if (strcasecmp("PersistentNotifications", child->key) == 0) {
      status = cf_util_get_boolean(child, &pcie_config.persistent);
    } else {
      ERROR(PCIE_ERRORS_PLUGIN ": Invalid configuration option \"%s\".",
            child->key);
      status = -1;
      break;
    }

    if (status) {
      ERROR(PCIE_ERRORS_PLUGIN ": Invalid configuration parameter \"%s\".",
            child->key);
      break;
    }
  }

  return status;
}
760
pcie_shutdown(void)761 static int pcie_shutdown(void) {
762 pcie_clear_list(pcie_dev_list);
763 pcie_dev_list = NULL;
764
765 return 0;
766 }
767
pcie_init(void)768 static int pcie_init(void) {
769
770 pcie_access_config();
771 pcie_dev_list = llist_create();
772 if (pcie_fops.list_devices(pcie_dev_list) != 0) {
773 ERROR(PCIE_ERRORS_PLUGIN ": Failed to find devices.");
774 pcie_shutdown();
775 return -1;
776 }
777 pcie_preprocess_devices(pcie_dev_list);
778 if (llist_size(pcie_dev_list) == 0) {
779 /* No any PCI Express devices were found on the system */
780 ERROR(PCIE_ERRORS_PLUGIN ": No PCIe devices found in %s",
781 pcie_config.access_dir);
782 pcie_shutdown();
783 return -1;
784 }
785
786 return 0;
787 }
788
module_register(void)789 void module_register(void) {
790 plugin_register_init(PCIE_ERRORS_PLUGIN, pcie_init);
791 plugin_register_complex_config(PCIE_ERRORS_PLUGIN, pcie_plugin_config);
792 plugin_register_complex_read(NULL, PCIE_ERRORS_PLUGIN, pcie_plugin_read, 0,
793 NULL);
794 plugin_register_shutdown(PCIE_ERRORS_PLUGIN, pcie_shutdown);
795 }
796