1 /*
2 * Copyright (c) 2011 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Sepherosa Ziehau <sepherosa@gmail.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/bus.h>
38 #include <sys/kernel.h>
39 #include <sys/malloc.h>
40 #include <sys/bitops.h>
41 #include <sys/sensors.h>
42
43 #include <bus/pci/pcivar.h>
44 #include <bus/pci/pcireg.h>
45 #include <bus/pci/pci_cfgreg.h>
46
47 #include <vm/pmap.h>
48
49 #include "coremctl_if.h"
50 #include "pcib_if.h"
51
52 #include <dev/misc/dimm/dimm.h>
53 #include <dev/misc/coremctl/coremctl_reg.h>
54
/* Memory controller generations; one of these is stored in ecc_ver. */
#define ECC_E3_VER_1		1	/* Sandy Bridge */
#define ECC_E3_VER_2		2	/* Ivy Bridge */
#define ECC_E3_VER_3		3	/* Haswell */

/* Initial value of the error-count threshold (ecc_thresh). */
#define ECC_E3_THRESH_DEFAULT	5

/* Array bounds used by the per-channel DIMM/rank bookkeeping. */
#define ECC_E3_CHAN_MAX		2
#define ECC_E3_CHAN_DIMM_MAX	2
#define ECC_E3_DIMM_RANK_MAX	2
#define ECC_E3_CHAN_RANK_MAX	\
	(ECC_E3_CHAN_DIMM_MAX * ECC_E3_DIMM_RANK_MAX)
65
/*
 * Entry of the supported-device table: a PCI device ID together with
 * the description set on a matching device and its controller version.
 */
struct ecc_e3_type {
	uint16_t	did;	/* PCI device ID matched in probe */
	const char	*desc;	/* device description; NULL ends the table */
	int		ver;	/* ECC_E3_VER_ */
};
71
72 struct ecc_e3_dimm {
73 TAILQ_ENTRY(ecc_e3_dimm) dimm_link;
74 struct dimm_softc *dimm_softc;
75 struct ksensor dimm_sensor;
76 };
77
/* One rank slot of a channel; records which DIMM the rank belongs to. */
struct ecc_e3_rank {
	struct ecc_e3_dimm	*rank_dimm_sc;
};
81
82 struct ecc_e3_chan {
83 int chan_id;
84 int chan_errlog0;
85 int chan_rank_cnt;
86 struct ecc_e3_rank chan_rank[ECC_E3_CHAN_RANK_MAX];
87 };
88
89 struct ecc_e3_softc {
90 device_t ecc_dev;
91 device_t ecc_parent; /* non-NULL if parent has MCHBAR */
92 int ecc_ver; /* ECC_E3_VER_ */
93 uint32_t ecc_flags; /* ECC_E3_FLAG_ */
94
95 struct ecc_e3_chan ecc_chan[ECC_E3_CHAN_MAX];
96 TAILQ_HEAD(, ecc_e3_dimm) ecc_dimm;
97
98 /*
99 * If the parent does not have MCHBAR,
100 * i.e. no DIMM location information
101 * for the ECC errors, fallback to the
102 * sensor and counters below.
103 */
104 struct ksensordev ecc_sensdev;
105 struct ksensor ecc_sens;
106 int ecc_count;
107 int ecc_thresh;
108 };
109
/* Bits of ecc_e3_softc.ecc_flags. */
#define ECC_E3_FLAG_SENSTASK	0x1	/* sensor task has been registered */
#define ECC_E3_FLAG_CRIT	0x2	/* critical condition already reported */

/* device_printf() on the softc's own device. */
#define ecc_printf(sc, fmt, ...) \
	device_printf((sc)->ecc_dev, fmt , ##__VA_ARGS__)
115
116 static int ecc_e3_probe(device_t);
117 static int ecc_e3_attach(device_t);
118 static int ecc_e3_detach(device_t);
119 static void ecc_e3_shutdown(device_t);
120
121 static void ecc_e3_attach_ch(struct ecc_e3_softc *, struct ecc_e3_chan *,
122 int, uint32_t, int);
123 static void ecc_e3_errlog(struct ecc_e3_softc *, boolean_t);
124 static void ecc_e3_errlog_ch(struct ecc_e3_softc *, struct ecc_e3_chan *,
125 boolean_t);
126 static void ecc_e3_stop(struct ecc_e3_softc *);
127
128 static void ecc_e3_sensor_task(void *);
129 static void ecc_e3_sensor_update(struct ecc_e3_softc *, boolean_t);
130
131 static const struct ecc_e3_type ecc_e3_types[] = {
132 { PCI_E3V1_MEMCTL_DID, "Intel E3 ECC", ECC_E3_VER_1 },
133 { PCI_E3V2_MEMCTL_DID, "Intel E3 v2 ECC", ECC_E3_VER_2 },
134 { PCI_E3V3_MEMCTL_DID, "Intel E3 v3 ECC", ECC_E3_VER_3 },
135 { 0, NULL, 0 } /* required last entry */
136 };
137
138 static device_method_t ecc_e3_methods[] = {
139 /* Device interface */
140 DEVMETHOD(device_probe, ecc_e3_probe),
141 DEVMETHOD(device_attach, ecc_e3_attach),
142 DEVMETHOD(device_detach, ecc_e3_detach),
143 DEVMETHOD(device_shutdown, ecc_e3_shutdown),
144 DEVMETHOD(device_suspend, bus_generic_suspend),
145 DEVMETHOD(device_resume, bus_generic_resume),
146 DEVMETHOD_END
147 };
148
149 static driver_t ecc_e3_driver = {
150 "ecc",
151 ecc_e3_methods,
152 sizeof(struct ecc_e3_softc)
153 };
154 static devclass_t ecc_devclass;
155 DRIVER_MODULE(ecc_e3, coremctl, ecc_e3_driver, ecc_devclass, NULL, NULL);
156 MODULE_DEPEND(ecc_e3, pci, 1, 1, 1);
157 MODULE_DEPEND(ecc_e3, coremctl, 1, 1, 1);
158 MODULE_VERSION(ecc_e3, 1);
159
160 static __inline uint32_t
CSR_READ_4(struct ecc_e3_softc * sc,int ofs)161 CSR_READ_4(struct ecc_e3_softc *sc, int ofs)
162 {
163 uint32_t val;
164 int error;
165
166 error = COREMCTL_MCH_READ(sc->ecc_parent, ofs, &val);
167 KASSERT(!error, ("mch read failed"));
168
169 return val;
170 }
171
172 static int
ecc_e3_probe(device_t dev)173 ecc_e3_probe(device_t dev)
174 {
175 const struct ecc_e3_type *t;
176 uint16_t did;
177
178 if (pci_get_vendor(dev) != PCI_CORE_MEMCTL_VID)
179 return ENXIO;
180
181 did = pci_get_device(dev);
182 for (t = ecc_e3_types; t->desc != NULL; ++t) {
183 if (t->did == did) {
184 struct ecc_e3_softc *sc = device_get_softc(dev);
185
186 device_set_desc(dev, t->desc);
187 sc->ecc_ver = t->ver;
188 return 0;
189 }
190 }
191 return ENXIO;
192 }
193
194 static int
ecc_e3_attach(device_t dev)195 ecc_e3_attach(device_t dev)
196 {
197 struct ecc_e3_softc *sc = device_get_softc(dev);
198 uint32_t val;
199 int error;
200
201 TAILQ_INIT(&sc->ecc_dimm);
202 sc->ecc_dev = dev;
203
204 /* Probe the existance of MCHBAR */
205 error = COREMCTL_MCH_READ(device_get_parent(dev), MCH_CORE_DIMM_CH0,
206 &val);
207 if (!error)
208 sc->ecc_parent = device_get_parent(dev);
209
210 if (sc->ecc_parent != NULL) {
211 uint32_t dimm_ch0, dimm_ch1;
212 int ecc_active;
213
214 if (bootverbose) {
215 ecc_printf(sc, "LOG0_C0 %#x\n",
216 CSR_READ_4(sc, MCH_E3_ERRLOG0_C0));
217 ecc_printf(sc, "LOG0_C1 %#x\n",
218 CSR_READ_4(sc, MCH_E3_ERRLOG0_C1));
219 }
220
221 dimm_ch0 = CSR_READ_4(sc, MCH_CORE_DIMM_CH0);
222 dimm_ch1 = CSR_READ_4(sc, MCH_CORE_DIMM_CH1);
223
224 ecc_e3_attach_ch(sc, &sc->ecc_chan[0], 0, dimm_ch0,
225 MCH_E3_ERRLOG0_C0);
226 ecc_e3_attach_ch(sc, &sc->ecc_chan[1], 1, dimm_ch1,
227 MCH_E3_ERRLOG0_C1);
228
229 ecc_active = 1;
230 if (sc->ecc_ver == ECC_E3_VER_1 ||
231 sc->ecc_ver == ECC_E3_VER_2) {
232 if (((dimm_ch0 | dimm_ch1) & MCH_E3_DIMM_ECC) ==
233 MCH_E3_DIMM_ECC_NONE) {
234 ecc_active = 0;
235 ecc_printf(sc, "No ECC active\n");
236 }
237 } else { /* v3 */
238 uint32_t ecc_mode0, ecc_mode1;
239
240 ecc_mode0 = __SHIFTOUT(dimm_ch0, MCH_E3_DIMM_ECC);
241 ecc_mode1 = __SHIFTOUT(dimm_ch1, MCH_E3_DIMM_ECC);
242
243 /*
244 * Only active ALL/NONE is supported
245 */
246
247 if (ecc_mode0 != MCH_E3_DIMM_ECC_NONE &&
248 ecc_mode0 != MCH_E3_DIMM_ECC_ALL) {
249 ecc_active = 0;
250 ecc_printf(sc, "channel0, invalid ECC "
251 "active 0x%x\n", ecc_mode0);
252 }
253 if (ecc_mode1 != MCH_E3_DIMM_ECC_NONE &&
254 ecc_mode1 != MCH_E3_DIMM_ECC_ALL) {
255 ecc_active = 0;
256 ecc_printf(sc, "channel1, invalid ECC "
257 "active 0x%x\n", ecc_mode1);
258 }
259
260 if (ecc_mode0 == MCH_E3_DIMM_ECC_NONE &&
261 ecc_mode1 == MCH_E3_DIMM_ECC_NONE) {
262 ecc_active = 0;
263 ecc_printf(sc, "No ECC active\n");
264 }
265 }
266
267 if (!ecc_active)
268 return 0;
269 } else {
270 ecc_printf(sc, "MCHBAR is not enabled\n");
271
272 /*
273 * Add hw.sensors.eccN.ecc0 MIB.
274 */
275 strlcpy(sc->ecc_sensdev.xname, device_get_nameunit(dev),
276 sizeof(sc->ecc_sensdev.xname));
277 strlcpy(sc->ecc_sens.desc, "node0 ecc",
278 sizeof(sc->ecc_sens.desc));
279 sc->ecc_sens.type = SENSOR_ECC;
280 sensor_set(&sc->ecc_sens, 0, SENSOR_S_OK);
281 sensor_attach(&sc->ecc_sensdev, &sc->ecc_sens);
282 sensordev_install(&sc->ecc_sensdev);
283
284 sc->ecc_thresh = ECC_E3_THRESH_DEFAULT;
285 SYSCTL_ADD_INT(device_get_sysctl_ctx(dev),
286 SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
287 OID_AUTO, "thresh", CTLFLAG_RW, &sc->ecc_thresh, 0,
288 "Raise alarm once number of ECC errors "
289 "goes above this value");
290 }
291
292 sc->ecc_flags |= ECC_E3_FLAG_SENSTASK;
293 sensor_task_register(sc, ecc_e3_sensor_task, 1);
294
295 return 0;
296 }
297
298 static void
ecc_e3_sensor_task(void * xsc)299 ecc_e3_sensor_task(void *xsc)
300 {
301 struct ecc_e3_softc *sc = xsc;
302 device_t dev = sc->ecc_dev;
303 uint16_t errsts;
304
305 errsts = pci_read_config(dev, PCI_E3_ERRSTS, 2);
306 if (errsts & (PCI_E3_ERRSTS_DSERR | PCI_E3_ERRSTS_DMERR)) {
307 boolean_t crit = FALSE;
308
309 if (errsts & PCI_E3_ERRSTS_DMERR)
310 crit = TRUE;
311
312 if (sc->ecc_parent != NULL)
313 ecc_e3_errlog(sc, crit);
314 else
315 ecc_e3_sensor_update(sc, crit);
316
317 /* Clear pending errors */
318 pci_write_config(dev, PCI_E3_ERRSTS, errsts, 2);
319 }
320 }
321
322 static void
ecc_e3_attach_ch(struct ecc_e3_softc * sc,struct ecc_e3_chan * chan,int chanid,uint32_t dimm_ch,int errlog0)323 ecc_e3_attach_ch(struct ecc_e3_softc *sc, struct ecc_e3_chan *chan,
324 int chanid, uint32_t dimm_ch, int errlog0)
325 {
326 int dimm_size[ECC_E3_CHAN_DIMM_MAX];
327 uint32_t dimm_szmask[ECC_E3_CHAN_DIMM_MAX];
328 uint32_t dimm_dlrank[ECC_E3_CHAN_DIMM_MAX];
329 int rank, dimm;
330
331 dimm_szmask[0] = MCH_CORE_DIMM_A_SIZE;
332 dimm_dlrank[0] = MCH_CORE_DIMM_A_DUAL_RANK;
333 dimm_szmask[1] = MCH_CORE_DIMM_B_SIZE;
334 dimm_dlrank[1] = MCH_CORE_DIMM_B_DUAL_RANK;
335 if (dimm_ch & MCH_CORE_DIMM_A_SELECT) {
336 dimm_szmask[0] = MCH_CORE_DIMM_B_SIZE;
337 dimm_dlrank[0] = MCH_CORE_DIMM_B_DUAL_RANK;
338 dimm_szmask[1] = MCH_CORE_DIMM_A_SIZE;
339 dimm_dlrank[1] = MCH_CORE_DIMM_A_DUAL_RANK;
340 }
341
342 dimm_size[0] = __SHIFTOUT(dimm_ch, dimm_szmask[0]);
343 dimm_size[1] = __SHIFTOUT(dimm_ch, dimm_szmask[1]);
344 if (dimm_size[0] == 0 && dimm_size[1] == 0)
345 return;
346
347 if (bootverbose) {
348 int ecc;
349
350 ecc = __SHIFTOUT(dimm_ch, MCH_E3_DIMM_ECC);
351 if (ecc == MCH_E3_DIMM_ECC_NONE) {
352 ecc_printf(sc, "channel%d, no ECC active\n", chanid);
353 } else if (ecc == MCH_E3_DIMM_ECC_ALL) {
354 ecc_printf(sc, "channel%d, ECC active IO/logic\n",
355 chanid);
356 } else {
357 if (sc->ecc_ver == ECC_E3_VER_1 ||
358 sc->ecc_ver == ECC_E3_VER_2) {
359 if (ecc == MCH_E3_DIMM_ECC_IO) {
360 ecc_printf(sc, "channel%d, "
361 "ECC active IO\n", chanid);
362 } else {
363 ecc_printf(sc, "channel%d, "
364 "ECC active logic\n", chanid);
365 }
366 } else { /* v3 */
367 ecc_printf(sc, "channel%d, "
368 "invalid ECC active 0x%x\n", chanid, ecc);
369 }
370 }
371 }
372
373 chan->chan_id = chanid;
374 chan->chan_errlog0 = errlog0;
375
376 rank = 0;
377 for (dimm = 0; dimm < ECC_E3_CHAN_DIMM_MAX; ++dimm) {
378 struct ecc_e3_dimm *dimm_sc;
379 struct ecc_e3_rank *rk;
380 struct ksensor *sens;
381
382 if (dimm_size[dimm] == 0)
383 continue;
384
385 dimm_sc = kmalloc(sizeof(*dimm_sc), M_DEVBUF,
386 M_WAITOK | M_ZERO);
387 dimm_sc->dimm_softc = dimm_create(0, chanid, dimm);
388
389 sens = &dimm_sc->dimm_sensor;
390 ksnprintf(sens->desc, sizeof(sens->desc),
391 "node0 chan%d DIMM%d ecc", chanid, dimm);
392 sens->type = SENSOR_ECC;
393 sensor_set(sens, 0, SENSOR_S_OK);
394 dimm_sensor_attach(dimm_sc->dimm_softc, sens);
395
396 TAILQ_INSERT_TAIL(&sc->ecc_dimm, dimm_sc, dimm_link);
397
398 KKASSERT(rank < ECC_E3_CHAN_RANK_MAX - 1);
399 rk = &chan->chan_rank[rank];
400 rank++;
401 rk->rank_dimm_sc = dimm_sc;
402 if (dimm_ch & dimm_dlrank[dimm]) {
403 rk = &chan->chan_rank[rank];
404 rank++;
405 rk->rank_dimm_sc = dimm_sc;
406 }
407 }
408 chan->chan_rank_cnt = rank;
409 }
410
411 static void
ecc_e3_errlog(struct ecc_e3_softc * sc,boolean_t crit)412 ecc_e3_errlog(struct ecc_e3_softc *sc, boolean_t crit)
413 {
414 int i;
415
416 for (i = 0; i < ECC_E3_CHAN_MAX; ++i) {
417 struct ecc_e3_chan *chan = &sc->ecc_chan[i];
418
419 if (chan->chan_errlog0 != 0)
420 ecc_e3_errlog_ch(sc, chan, crit);
421 }
422 }
423
424 static void
ecc_e3_errlog_ch(struct ecc_e3_softc * sc,struct ecc_e3_chan * chan,boolean_t crit)425 ecc_e3_errlog_ch(struct ecc_e3_softc *sc, struct ecc_e3_chan *chan,
426 boolean_t crit)
427 {
428 uint32_t err0;
429 int rank;
430
431 err0 = CSR_READ_4(sc, chan->chan_errlog0);
432 if ((err0 & (MCH_E3_ERRLOG0_CERRSTS | MCH_E3_ERRLOG0_MERRSTS)) == 0)
433 return;
434
435 rank = __SHIFTOUT(err0, MCH_E3_ERRLOG0_ERRRANK);
436 if (rank >= chan->chan_rank_cnt) {
437 ecc_printf(sc, "channel%d rank%d %serror\n", chan->chan_id,
438 rank, crit ? "critical " : "");
439 } else {
440 struct ecc_e3_dimm *dimm_sc;
441
442 dimm_sc = chan->chan_rank[rank].rank_dimm_sc;
443 dimm_sensor_ecc_add(dimm_sc->dimm_softc, &dimm_sc->dimm_sensor,
444 1, crit);
445 }
446 }
447
448 static int
ecc_e3_detach(device_t dev)449 ecc_e3_detach(device_t dev)
450 {
451 struct ecc_e3_softc *sc = device_get_softc(dev);
452
453 ecc_e3_stop(sc);
454
455 if (sc->ecc_parent != NULL) {
456 struct ecc_e3_dimm *dimm_sc;
457
458 while ((dimm_sc = TAILQ_FIRST(&sc->ecc_dimm)) != NULL) {
459 TAILQ_REMOVE(&sc->ecc_dimm, dimm_sc, dimm_link);
460 dimm_sensor_detach(dimm_sc->dimm_softc,
461 &dimm_sc->dimm_sensor);
462 dimm_destroy(dimm_sc->dimm_softc);
463
464 kfree(dimm_sc, M_DEVBUF);
465 }
466 } else {
467 sensordev_deinstall(&sc->ecc_sensdev);
468 }
469 return 0;
470 }
471
472 static void
ecc_e3_shutdown(device_t dev)473 ecc_e3_shutdown(device_t dev)
474 {
475 ecc_e3_stop(device_get_softc(dev));
476 }
477
478 static void
ecc_e3_stop(struct ecc_e3_softc * sc)479 ecc_e3_stop(struct ecc_e3_softc *sc)
480 {
481 if (sc->ecc_flags & ECC_E3_FLAG_SENSTASK)
482 sensor_task_unregister(sc);
483 }
484
485 static void
ecc_e3_sensor_update(struct ecc_e3_softc * sc,boolean_t crit)486 ecc_e3_sensor_update(struct ecc_e3_softc *sc, boolean_t crit)
487 {
488 enum sensor_status status;
489
490 sc->ecc_count++;
491 if (!crit && sc->ecc_count >= sc->ecc_thresh)
492 crit = TRUE;
493
494 if (crit && (sc->ecc_flags & ECC_E3_FLAG_CRIT) == 0) {
495 char ecc_str[16];
496
497 ksnprintf(ecc_str, sizeof(ecc_str), "%d", sc->ecc_count);
498 devctl_notify("ecc", "ECC", ecc_str, "node=0");
499
500 ecc_printf(sc, "too many ECC errors %d\n", sc->ecc_count);
501 sc->ecc_flags |= ECC_E3_FLAG_CRIT;
502 }
503
504 if (sc->ecc_flags & ECC_E3_FLAG_CRIT)
505 status = SENSOR_S_CRIT;
506 else
507 status = SENSOR_S_OK;
508 sensor_set(&sc->ecc_sens, sc->ecc_count, status);
509 }
510