xref: /dragonfly/sys/dev/misc/ecc/ecc_e3.c (revision 9348a738)
1 /*
2  * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Sepherosa Ziehau <sepherosa@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/bus.h>
38 #include <sys/kernel.h>
39 #include <sys/malloc.h>
40 #include <sys/bitops.h>
41 #include <sys/sensors.h>
42 
43 #include <bus/pci/pcivar.h>
44 #include <bus/pci/pcireg.h>
45 #include <bus/pci/pci_cfgreg.h>
46 
47 #include <vm/pmap.h>
48 
49 #include "coremctl_if.h"
50 #include "pcib_if.h"
51 
52 #include <dev/misc/dimm/dimm.h>
53 #include <dev/misc/coremctl/coremctl_reg.h>
54 
/*
 * Controller generations handled by this driver.  The value is kept in
 * ecc_e3_type.ver and copied into ecc_e3_softc.ecc_ver at probe time.
 */
#define ECC_E3_VER_1	1	/* Sandy Bridge */
#define ECC_E3_VER_2	2	/* Ivy Bridge */
#define ECC_E3_VER_3	3	/* Haswell */

/* Initial value of the "thresh" sysctl (ecc_e3_softc.ecc_thresh). */
#define ECC_E3_THRESH_DEFAULT	5

/*
 * Array bounds used by the softc: number of channels, DIMMs per
 * channel, ranks per DIMM, and the resulting ranks per channel.
 */
#define ECC_E3_CHAN_MAX		2
#define ECC_E3_CHAN_DIMM_MAX	2
#define ECC_E3_DIMM_RANK_MAX	2
#define ECC_E3_CHAN_RANK_MAX	(ECC_E3_CHAN_DIMM_MAX  * ECC_E3_DIMM_RANK_MAX)
65 
/*
 * One entry of the supported-device table (ecc_e3_types): maps a PCI
 * device ID to the description set on the device and to the
 * controller generation.
 */
struct ecc_e3_type {
	uint16_t	did;		/* PCI device ID to match */
	const char	*desc;		/* device description; NULL ends table */
	int		ver;		/* ECC_E3_VER_ */
};
71 
/*
 * Per-DIMM state, allocated in ecc_e3_attach_ch() and released in
 * ecc_e3_detach().
 */
struct ecc_e3_dimm {
	TAILQ_ENTRY(ecc_e3_dimm) dimm_link;	/* on ecc_e3_softc.ecc_dimm */
	struct dimm_softc	*dimm_softc;	/* from dimm_create() */
	struct ksensor		dimm_sensor;	/* SENSOR_ECC sensor of DIMM */
};
77 
/*
 * Maps one rank of a channel back to the DIMM it belongs to; both
 * ranks of a dual-rank DIMM point at the same ecc_e3_dimm.
 */
struct ecc_e3_rank {
	struct ecc_e3_dimm	*rank_dimm_sc;
};
81 
/*
 * Per-channel state, filled in by ecc_e3_attach_ch().
 */
struct ecc_e3_chan {
	int			chan_id;	/* channel index, for messages */
	int			chan_errlog0;	/* error log register offset;
						 * ecc_e3_errlog() skips the
						 * channel while this is 0 */
	int			chan_rank_cnt;	/* valid entries in chan_rank */
	struct ecc_e3_rank	chan_rank[ECC_E3_CHAN_RANK_MAX];
};
88 
/*
 * Per-device driver state.
 */
struct ecc_e3_softc {
	device_t	ecc_dev;	/* this device, set in attach */
	device_t	ecc_parent;	/* non-NULL if parent has MCHBAR */
	int		ecc_ver;	/* ECC_E3_VER_ */
	uint32_t	ecc_flags;	/* ECC_E3_FLAG_ */

	/* Per-channel rank->DIMM maps and the list of all created DIMMs. */
	struct ecc_e3_chan ecc_chan[ECC_E3_CHAN_MAX];
	TAILQ_HEAD(, ecc_e3_dimm) ecc_dimm;

	/*
	 * If the parent does not have MCHBAR,
	 * i.e. no DIMM location information
	 * for the ECC errors, fallback to the
	 * sensor and counters below.
	 */
	struct ksensordev ecc_sensdev;
	struct ksensor	ecc_sens;
	int		ecc_count;	/* errors seen so far */
	int		ecc_thresh;	/* alarm threshold ("thresh" sysctl) */
};
109 
/* Bits of ecc_e3_softc.ecc_flags. */
#define ECC_E3_FLAG_SENSTASK	0x1	/* sensor task has been registered */
#define ECC_E3_FLAG_CRIT	0x2	/* fallback sensor went critical */

/* device_printf() on the softc's own device. */
#define ecc_printf(sc, fmt, arg...) \
	device_printf((sc)->ecc_dev, fmt , ##arg)
115 
/* Device interface methods. */
static int	ecc_e3_probe(device_t);
static int	ecc_e3_attach(device_t);
static int	ecc_e3_detach(device_t);
static void	ecc_e3_shutdown(device_t);

/* Channel setup, error log handling and task teardown. */
static void	ecc_e3_attach_ch(struct ecc_e3_softc *, struct ecc_e3_chan *,
		    int, uint32_t, int);
static void	ecc_e3_errlog(struct ecc_e3_softc *, boolean_t);
static void	ecc_e3_errlog_ch(struct ecc_e3_softc *, struct ecc_e3_chan *,
		    boolean_t);
static void	ecc_e3_stop(struct ecc_e3_softc *);

/* Periodic sensor task and the fallback (no MCHBAR) sensor update. */
static void	ecc_e3_sensor_task(void *);
static void	ecc_e3_sensor_update(struct ecc_e3_softc *, boolean_t);
130 
/*
 * Supported memory controllers, searched by ecc_e3_probe().  The table
 * is terminated by the entry whose desc is NULL.
 */
static const struct ecc_e3_type ecc_e3_types[] = {
	{ PCI_E3V1_MEMCTL_DID, "Intel E3 ECC", ECC_E3_VER_1 },
	{ PCI_E3V2_MEMCTL_DID, "Intel E3 v2 ECC", ECC_E3_VER_2 },
	{ PCI_E3V3_MEMCTL_DID, "Intel E3 v3 ECC", ECC_E3_VER_3 },
	{ 0, NULL, 0 } /* required last entry */
};
137 
/*
 * Device method table: probe/attach/detach/shutdown are implemented in
 * this file, suspend/resume are handed to the generic bus routines.
 */
static device_method_t ecc_e3_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		ecc_e3_probe),
	DEVMETHOD(device_attach,	ecc_e3_attach),
	DEVMETHOD(device_detach,	ecc_e3_detach),
	DEVMETHOD(device_shutdown,	ecc_e3_shutdown),
	DEVMETHOD(device_suspend,	bus_generic_suspend),
	DEVMETHOD(device_resume,	bus_generic_resume),
	DEVMETHOD_END
};
148 
/* Driver description: device name, method table and softc size. */
static driver_t ecc_e3_driver = {
	"ecc",
	ecc_e3_methods,
	sizeof(struct ecc_e3_softc)
};
static devclass_t ecc_devclass;
/*
 * Register the ecc_e3 module with coremctl as its parent and declare
 * its dependencies on the pci and coremctl modules.
 */
DRIVER_MODULE(ecc_e3, coremctl, ecc_e3_driver, ecc_devclass, NULL, NULL);
MODULE_DEPEND(ecc_e3, pci, 1, 1, 1);
MODULE_DEPEND(ecc_e3, coremctl, 1, 1, 1);
MODULE_VERSION(ecc_e3, 1);
159 
160 static __inline uint32_t
161 CSR_READ_4(struct ecc_e3_softc *sc, int ofs)
162 {
163 	uint32_t val;
164 	int error;
165 
166 	error = COREMCTL_MCH_READ(sc->ecc_parent, ofs, &val);
167 	KASSERT(!error, ("mch read failed"));
168 
169 	return val;
170 }
171 
172 static int
173 ecc_e3_probe(device_t dev)
174 {
175 	const struct ecc_e3_type *t;
176 	uint16_t did;
177 
178 	if (pci_get_vendor(dev) != PCI_CORE_MEMCTL_VID)
179 		return ENXIO;
180 
181 	did = pci_get_device(dev);
182 	for (t = ecc_e3_types; t->desc != NULL; ++t) {
183 		if (t->did == did) {
184 			struct ecc_e3_softc *sc = device_get_softc(dev);
185 
186 			device_set_desc(dev, t->desc);
187 			sc->ecc_ver = t->ver;
188 			return 0;
189 		}
190 	}
191 	return ENXIO;
192 }
193 
194 static int
195 ecc_e3_attach(device_t dev)
196 {
197 	struct ecc_e3_softc *sc = device_get_softc(dev);
198 	uint32_t val;
199 	int error;
200 
201 	TAILQ_INIT(&sc->ecc_dimm);
202 	sc->ecc_dev = dev;
203 
204 	/* Probe the existance of MCHBAR */
205 	error = COREMCTL_MCH_READ(device_get_parent(dev), MCH_CORE_DIMM_CH0,
206 	    &val);
207 	if (!error)
208 		sc->ecc_parent = device_get_parent(dev);
209 
210 	if (sc->ecc_parent != NULL) {
211 		uint32_t dimm_ch0, dimm_ch1;
212 		int ecc_active;
213 
214 		if (bootverbose) {
215 			ecc_printf(sc, "LOG0_C0 %#x\n",
216 			    CSR_READ_4(sc, MCH_E3_ERRLOG0_C0));
217 			ecc_printf(sc, "LOG0_C1 %#x\n",
218 			    CSR_READ_4(sc, MCH_E3_ERRLOG0_C1));
219 		}
220 
221 		dimm_ch0 = CSR_READ_4(sc, MCH_CORE_DIMM_CH0);
222 		dimm_ch1 = CSR_READ_4(sc, MCH_CORE_DIMM_CH1);
223 
224 		ecc_e3_attach_ch(sc, &sc->ecc_chan[0], 0, dimm_ch0,
225 		    MCH_E3_ERRLOG0_C0);
226 		ecc_e3_attach_ch(sc, &sc->ecc_chan[1], 1, dimm_ch1,
227 		    MCH_E3_ERRLOG0_C1);
228 
229 		ecc_active = 1;
230 		if (sc->ecc_ver == ECC_E3_VER_1 ||
231 		    sc->ecc_ver == ECC_E3_VER_2) {
232 			if (((dimm_ch0 | dimm_ch1) & MCH_E3_DIMM_ECC) ==
233 			    MCH_E3_DIMM_ECC_NONE) {
234 				ecc_active = 0;
235 				ecc_printf(sc, "No ECC active\n");
236 			}
237 		} else { /* v3 */
238 			uint32_t ecc_mode0, ecc_mode1;
239 
240 			ecc_mode0 = __SHIFTOUT(dimm_ch0, MCH_E3_DIMM_ECC);
241 			ecc_mode1 = __SHIFTOUT(dimm_ch1, MCH_E3_DIMM_ECC);
242 
243 			/*
244 			 * Only active ALL/NONE is supported
245 			 */
246 
247 			if (ecc_mode0 != MCH_E3_DIMM_ECC_NONE &&
248 			    ecc_mode0 != MCH_E3_DIMM_ECC_ALL) {
249 				ecc_active = 0;
250 				ecc_printf(sc, "channel0, invalid ECC "
251 				    "active 0x%x\n", ecc_mode0);
252 			}
253 			if (ecc_mode1 != MCH_E3_DIMM_ECC_NONE &&
254 			    ecc_mode1 != MCH_E3_DIMM_ECC_ALL) {
255 				ecc_active = 0;
256 				ecc_printf(sc, "channel1, invalid ECC "
257 				    "active 0x%x\n", ecc_mode1);
258 			}
259 
260 			if (ecc_mode0 == MCH_E3_DIMM_ECC_NONE &&
261 			    ecc_mode1 == MCH_E3_DIMM_ECC_NONE) {
262 				ecc_active = 0;
263 				ecc_printf(sc, "No ECC active\n");
264 			}
265 		}
266 
267 		if (!ecc_active)
268 			return 0;
269 	} else {
270 		ecc_printf(sc, "MCHBAR is not enabled\n");
271 
272 		/*
273 		 * Add hw.sensors.eccN.ecc0 MIB.
274 		 */
275 		strlcpy(sc->ecc_sensdev.xname, device_get_nameunit(dev),
276 		    sizeof(sc->ecc_sensdev.xname));
277 		strlcpy(sc->ecc_sens.desc, "node0 ecc",
278 		    sizeof(sc->ecc_sens.desc));
279 		sc->ecc_sens.type = SENSOR_ECC;
280 		sensor_set(&sc->ecc_sens, 0, SENSOR_S_OK);
281 		sensor_attach(&sc->ecc_sensdev, &sc->ecc_sens);
282 		sensordev_install(&sc->ecc_sensdev);
283 
284 		sc->ecc_thresh = ECC_E3_THRESH_DEFAULT;
285 		SYSCTL_ADD_INT(device_get_sysctl_ctx(dev),
286 		    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
287 		    OID_AUTO, "thresh", CTLFLAG_RW, &sc->ecc_thresh, 0,
288 		    "Raise alarm once number of ECC errors "
289 		    "goes above this value");
290 	}
291 
292 	sc->ecc_flags |= ECC_E3_FLAG_SENSTASK;
293 	sensor_task_register(sc, ecc_e3_sensor_task, 1);
294 
295 	return 0;
296 }
297 
298 static void
299 ecc_e3_sensor_task(void *xsc)
300 {
301 	struct ecc_e3_softc *sc = xsc;
302 	device_t dev = sc->ecc_dev;
303 	uint16_t errsts;
304 
305 	errsts = pci_read_config(dev, PCI_E3_ERRSTS, 2);
306 	if (errsts & (PCI_E3_ERRSTS_DSERR | PCI_E3_ERRSTS_DMERR)) {
307 		boolean_t crit = FALSE;
308 
309 		if (errsts & PCI_E3_ERRSTS_DMERR)
310 			crit = TRUE;
311 
312 		if (sc->ecc_parent != NULL)
313 			ecc_e3_errlog(sc, crit);
314 		else
315 			ecc_e3_sensor_update(sc, crit);
316 
317 		/* Clear pending errors */
318 		pci_write_config(dev, PCI_E3_ERRSTS, errsts, 2);
319 	}
320 }
321 
322 static void
323 ecc_e3_attach_ch(struct ecc_e3_softc *sc, struct ecc_e3_chan *chan,
324     int chanid, uint32_t dimm_ch, int errlog0)
325 {
326 	int dimm_size[ECC_E3_CHAN_DIMM_MAX];
327 	uint32_t dimm_szmask[ECC_E3_CHAN_DIMM_MAX];
328 	uint32_t dimm_dlrank[ECC_E3_CHAN_DIMM_MAX];
329 	int rank, dimm;
330 
331 	dimm_szmask[0] = MCH_CORE_DIMM_A_SIZE;
332 	dimm_dlrank[0] = MCH_CORE_DIMM_A_DUAL_RANK;
333 	dimm_szmask[1] = MCH_CORE_DIMM_B_SIZE;
334 	dimm_dlrank[1] = MCH_CORE_DIMM_B_DUAL_RANK;
335 	if (dimm_ch & MCH_CORE_DIMM_A_SELECT) {
336 		dimm_szmask[0] = MCH_CORE_DIMM_B_SIZE;
337 		dimm_dlrank[0] = MCH_CORE_DIMM_B_DUAL_RANK;
338 		dimm_szmask[1] = MCH_CORE_DIMM_A_SIZE;
339 		dimm_dlrank[1] = MCH_CORE_DIMM_A_DUAL_RANK;
340 	}
341 
342 	dimm_size[0] = __SHIFTOUT(dimm_ch, dimm_szmask[0]);
343 	dimm_size[1] = __SHIFTOUT(dimm_ch, dimm_szmask[1]);
344 	if (dimm_size[0] == 0 && dimm_size[1] == 0)
345 		return;
346 
347 	if (bootverbose) {
348 		int ecc;
349 
350 		ecc = __SHIFTOUT(dimm_ch, MCH_E3_DIMM_ECC);
351 		if (ecc == MCH_E3_DIMM_ECC_NONE) {
352 			ecc_printf(sc, "channel%d, no ECC active\n", chanid);
353 		} else if (ecc == MCH_E3_DIMM_ECC_ALL) {
354 			ecc_printf(sc, "channel%d, ECC active IO/logic\n",
355 			    chanid);
356 		} else {
357 			if (sc->ecc_ver == ECC_E3_VER_1 ||
358 			    sc->ecc_ver == ECC_E3_VER_2) {
359 				if (ecc == MCH_E3_DIMM_ECC_IO) {
360 					ecc_printf(sc, "channel%d, "
361 					    "ECC active IO\n", chanid);
362 				} else {
363 					ecc_printf(sc, "channel%d, "
364 					    "ECC active logic\n", chanid);
365 				}
366 			} else { /* v3 */
367 				ecc_printf(sc, "channel%d, "
368 				    "invalid ECC active 0x%x\n", chanid, ecc);
369 			}
370 		}
371 	}
372 
373 	chan->chan_id = chanid;
374 	chan->chan_errlog0 = errlog0;
375 
376 	rank = 0;
377 	for (dimm = 0; dimm < ECC_E3_CHAN_DIMM_MAX; ++dimm) {
378 		struct ecc_e3_dimm *dimm_sc;
379 		struct ecc_e3_rank *rk;
380 		struct ksensor *sens;
381 
382 		if (dimm_size[dimm] == 0)
383 			continue;
384 
385 		dimm_sc = kmalloc(sizeof(*dimm_sc), M_DEVBUF,
386 		    M_WAITOK | M_ZERO);
387 		dimm_sc->dimm_softc = dimm_create(0, chanid, dimm);
388 
389 		sens = &dimm_sc->dimm_sensor;
390 		ksnprintf(sens->desc, sizeof(sens->desc),
391 		    "node0 chan%d DIMM%d ecc", chanid, dimm);
392 		sens->type = SENSOR_ECC;
393 		sensor_set(sens, 0, SENSOR_S_OK);
394 		dimm_sensor_attach(dimm_sc->dimm_softc, sens);
395 
396 		TAILQ_INSERT_TAIL(&sc->ecc_dimm, dimm_sc, dimm_link);
397 
398 		KKASSERT(rank < ECC_E3_CHAN_RANK_MAX - 1);
399 		rk = &chan->chan_rank[rank];
400 		rank++;
401 		rk->rank_dimm_sc = dimm_sc;
402 		if (dimm_ch & dimm_dlrank[dimm]) {
403 			rk = &chan->chan_rank[rank];
404 			rank++;
405 			rk->rank_dimm_sc = dimm_sc;
406 		}
407 	}
408 	chan->chan_rank_cnt = rank;
409 }
410 
411 static void
412 ecc_e3_errlog(struct ecc_e3_softc *sc, boolean_t crit)
413 {
414 	int i;
415 
416 	for (i = 0; i < ECC_E3_CHAN_MAX; ++i) {
417 		struct ecc_e3_chan *chan = &sc->ecc_chan[i];
418 
419 		if (chan->chan_errlog0 != 0)
420 			ecc_e3_errlog_ch(sc, chan, crit);
421 	}
422 }
423 
424 static void
425 ecc_e3_errlog_ch(struct ecc_e3_softc *sc, struct ecc_e3_chan *chan,
426     boolean_t crit)
427 {
428 	uint32_t err0;
429 	int rank;
430 
431 	err0 = CSR_READ_4(sc, chan->chan_errlog0);
432 	if ((err0 & (MCH_E3_ERRLOG0_CERRSTS | MCH_E3_ERRLOG0_MERRSTS)) == 0)
433 		return;
434 
435 	rank = __SHIFTOUT(err0, MCH_E3_ERRLOG0_ERRRANK);
436 	if (rank >= chan->chan_rank_cnt) {
437 		ecc_printf(sc, "channel%d rank%d %serror\n", chan->chan_id,
438 		    rank, crit ? "critical " : "");
439 	} else {
440 		struct ecc_e3_dimm *dimm_sc;
441 
442 		dimm_sc = chan->chan_rank[rank].rank_dimm_sc;
443 		dimm_sensor_ecc_add(dimm_sc->dimm_softc, &dimm_sc->dimm_sensor,
444 		    1, crit);
445 	}
446 }
447 
448 static int
449 ecc_e3_detach(device_t dev)
450 {
451 	struct ecc_e3_softc *sc = device_get_softc(dev);
452 
453 	ecc_e3_stop(sc);
454 
455 	if (sc->ecc_parent != NULL) {
456 		struct ecc_e3_dimm *dimm_sc;
457 
458 		while ((dimm_sc = TAILQ_FIRST(&sc->ecc_dimm)) != NULL) {
459 			TAILQ_REMOVE(&sc->ecc_dimm, dimm_sc, dimm_link);
460 			dimm_sensor_detach(dimm_sc->dimm_softc,
461 			    &dimm_sc->dimm_sensor);
462 			dimm_destroy(dimm_sc->dimm_softc);
463 
464 			kfree(dimm_sc, M_DEVBUF);
465 		}
466 	} else {
467 		sensordev_deinstall(&sc->ecc_sensdev);
468 	}
469 	return 0;
470 }
471 
472 static void
473 ecc_e3_shutdown(device_t dev)
474 {
475 	ecc_e3_stop(device_get_softc(dev));
476 }
477 
478 static void
479 ecc_e3_stop(struct ecc_e3_softc *sc)
480 {
481 	if (sc->ecc_flags & ECC_E3_FLAG_SENSTASK)
482 		sensor_task_unregister(sc);
483 }
484 
485 static void
486 ecc_e3_sensor_update(struct ecc_e3_softc *sc, boolean_t crit)
487 {
488 	enum sensor_status status;
489 
490 	sc->ecc_count++;
491 	if (!crit && sc->ecc_count >= sc->ecc_thresh)
492 		crit = TRUE;
493 
494 	if (crit && (sc->ecc_flags & ECC_E3_FLAG_CRIT) == 0) {
495 		char ecc_str[16];
496 
497 		ksnprintf(ecc_str, sizeof(ecc_str), "%d", sc->ecc_count);
498 		devctl_notify("ecc", "ECC", ecc_str, "node=0");
499 
500 		ecc_printf(sc, "too many ECC errors %d\n", sc->ecc_count);
501 		sc->ecc_flags |= ECC_E3_FLAG_CRIT;
502 	}
503 
504 	if (sc->ecc_flags & ECC_E3_FLAG_CRIT)
505 		status = SENSOR_S_CRIT;
506 	else
507 		status = SENSOR_S_OK;
508 	sensor_set(&sc->ecc_sens, sc->ecc_count, status);
509 }
510