xref: /dragonfly/sys/dev/misc/ecc/ecc_e3.c (revision 0de090e1)
1 /*
2  * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Sepherosa Ziehau <sepherosa@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/bus.h>
38 #include <sys/kernel.h>
39 #include <sys/malloc.h>
40 #include <sys/bitops.h>
41 #include <sys/sensors.h>
42 
43 #include <bus/pci/pcivar.h>
44 #include <bus/pci/pcireg.h>
45 #include <bus/pci/pci_cfgreg.h>
46 
47 #include <vm/pmap.h>
48 
49 #include "coremctl_if.h"
50 #include "pcib_if.h"
51 
52 #include <dev/misc/dimm/dimm.h>
53 #include <dev/misc/coremctl/coremctl_reg.h>
54 
/*
 * Memory controller generations handled by this driver; the value is
 * stored in ecc_e3_type.ver and ecc_e3_softc.ecc_ver.
 */
#define ECC_E3_VER_1		1	/* Sandy Bridge */
#define ECC_E3_VER_2		2	/* Ivy Bridge */
#define ECC_E3_VER_3		3	/* Haswell */

/* Initial value of ecc_e3_softc.ecc_thresh (fallback sensor only) */
#define ECC_E3_THRESH_DEFAULT	5

/* Sizing of the per-channel/per-DIMM/per-rank bookkeeping arrays */
#define ECC_E3_CHAN_MAX		2
#define ECC_E3_CHAN_DIMM_MAX	2
#define ECC_E3_DIMM_RANK_MAX	2
#define ECC_E3_CHAN_RANK_MAX	(ECC_E3_CHAN_DIMM_MAX  * ECC_E3_DIMM_RANK_MAX)
65 
/*
 * One entry of the supported-device table (ecc_e3_types[]).
 * The table is terminated by an entry whose desc is NULL.
 */
struct ecc_e3_type {
	uint16_t	did;	/* PCI device id compared in probe */
	const char	*desc;	/* device description set in probe */
	int		ver;	/* ECC_E3_VER_ */
};
71 
72 struct ecc_e3_dimm {
73 	TAILQ_ENTRY(ecc_e3_dimm) dimm_link;
74 	struct dimm_softc	*dimm_softc;
75 	struct ksensor		dimm_sensor;
76 };
77 
/*
 * Maps one rank of a channel back to the DIMM it belongs to; both ranks
 * of a dual-rank DIMM point at the same ecc_e3_dimm.
 */
struct ecc_e3_rank {
	struct ecc_e3_dimm	*rank_dimm_sc;
};
81 
82 struct ecc_e3_chan {
83 	int			chan_id;
84 	int			chan_errlog0;
85 	int			chan_rank_cnt;
86 	struct ecc_e3_rank	chan_rank[ECC_E3_CHAN_RANK_MAX];
87 };
88 
89 struct ecc_e3_softc {
90 	device_t	ecc_dev;
91 	device_t	ecc_parent;	/* non-NULL if parent has MCHBAR */
92 	int		ecc_ver;	/* ECC_E3_VER_ */
93 	uint32_t	ecc_flags;	/* ECC_E3_FLAG_ */
94 
95 	struct ecc_e3_chan ecc_chan[ECC_E3_CHAN_MAX];
96 	TAILQ_HEAD(, ecc_e3_dimm) ecc_dimm;
97 
98 	/*
99 	 * If the parent does not have MCHBAR,
100 	 * i.e. no DIMM location information
101 	 * for the ECC errors, fallback to the
102 	 * sensor and counters below.
103 	 */
104 	struct ksensordev ecc_sensdev;
105 	struct ksensor	ecc_sens;
106 	int		ecc_count;
107 	int		ecc_thresh;
108 };
109 
/* Bits of ecc_e3_softc.ecc_flags */
#define ECC_E3_FLAG_SENSTASK	0x1	/* sensor task has been registered */
#define ECC_E3_FLAG_CRIT	0x2	/* fallback sensor went critical */

/* device_printf() on the driver's own device */
#define ecc_printf(sc, fmt, ...) \
	device_printf((sc)->ecc_dev, fmt , ##__VA_ARGS__)
115 
116 static int	ecc_e3_probe(device_t);
117 static int	ecc_e3_attach(device_t);
118 static int	ecc_e3_detach(device_t);
119 static void	ecc_e3_shutdown(device_t);
120 
121 static void	ecc_e3_attach_ch(struct ecc_e3_softc *, struct ecc_e3_chan *,
122 		    int, uint32_t, int);
123 static void	ecc_e3_errlog(struct ecc_e3_softc *, boolean_t);
124 static void	ecc_e3_errlog_ch(struct ecc_e3_softc *, struct ecc_e3_chan *,
125 		    boolean_t);
126 static void	ecc_e3_stop(struct ecc_e3_softc *);
127 
128 static void	ecc_e3_sensor_task(void *);
129 static void	ecc_e3_sensor_update(struct ecc_e3_softc *, boolean_t);
130 
131 static const struct ecc_e3_type ecc_e3_types[] = {
132 	{ PCI_E3V1_MEMCTL_DID, "Intel E3 ECC", ECC_E3_VER_1 },
133 	{ PCI_E3V2_MEMCTL_DID, "Intel E3 v2 ECC", ECC_E3_VER_2 },
134 	{ PCI_E3V3_MEMCTL_DID, "Intel E3 v3 ECC", ECC_E3_VER_3 },
135 	{ 0, NULL, 0 } /* required last entry */
136 };
137 
138 static device_method_t ecc_e3_methods[] = {
139 	/* Device interface */
140 	DEVMETHOD(device_probe,		ecc_e3_probe),
141 	DEVMETHOD(device_attach,	ecc_e3_attach),
142 	DEVMETHOD(device_detach,	ecc_e3_detach),
143 	DEVMETHOD(device_shutdown,	ecc_e3_shutdown),
144 	DEVMETHOD(device_suspend,	bus_generic_suspend),
145 	DEVMETHOD(device_resume,	bus_generic_resume),
146 	DEVMETHOD_END
147 };
148 
149 static driver_t ecc_e3_driver = {
150 	"ecc",
151 	ecc_e3_methods,
152 	sizeof(struct ecc_e3_softc)
153 };
154 static devclass_t ecc_devclass;
155 DRIVER_MODULE(ecc_e3, coremctl, ecc_e3_driver, ecc_devclass, NULL, NULL);
156 MODULE_DEPEND(ecc_e3, pci, 1, 1, 1);
157 MODULE_DEPEND(ecc_e3, coremctl, 1, 1, 1);
158 
159 static __inline uint32_t
160 CSR_READ_4(struct ecc_e3_softc *sc, int ofs)
161 {
162 	uint32_t val;
163 	int error;
164 
165 	error = COREMCTL_MCH_READ(sc->ecc_parent, ofs, &val);
166 	KASSERT(!error, ("mch read failed"));
167 
168 	return val;
169 }
170 
171 static int
172 ecc_e3_probe(device_t dev)
173 {
174 	const struct ecc_e3_type *t;
175 	uint16_t did;
176 
177 	if (pci_get_vendor(dev) != PCI_CORE_MEMCTL_VID)
178 		return ENXIO;
179 
180 	did = pci_get_device(dev);
181 	for (t = ecc_e3_types; t->desc != NULL; ++t) {
182 		if (t->did == did) {
183 			struct ecc_e3_softc *sc = device_get_softc(dev);
184 
185 			device_set_desc(dev, t->desc);
186 			sc->ecc_ver = t->ver;
187 			return 0;
188 		}
189 	}
190 	return ENXIO;
191 }
192 
193 static int
194 ecc_e3_attach(device_t dev)
195 {
196 	struct ecc_e3_softc *sc = device_get_softc(dev);
197 	uint32_t val;
198 	int error;
199 
200 	TAILQ_INIT(&sc->ecc_dimm);
201 	sc->ecc_dev = dev;
202 
203 	/* Probe the existance of MCHBAR */
204 	error = COREMCTL_MCH_READ(device_get_parent(dev), MCH_CORE_DIMM_CH0,
205 	    &val);
206 	if (!error)
207 		sc->ecc_parent = device_get_parent(dev);
208 
209 	if (sc->ecc_parent != NULL) {
210 		uint32_t dimm_ch0, dimm_ch1;
211 		int ecc_active;
212 
213 		if (bootverbose) {
214 			ecc_printf(sc, "LOG0_C0 %#x\n",
215 			    CSR_READ_4(sc, MCH_E3_ERRLOG0_C0));
216 			ecc_printf(sc, "LOG0_C1 %#x\n",
217 			    CSR_READ_4(sc, MCH_E3_ERRLOG0_C1));
218 		}
219 
220 		dimm_ch0 = CSR_READ_4(sc, MCH_CORE_DIMM_CH0);
221 		dimm_ch1 = CSR_READ_4(sc, MCH_CORE_DIMM_CH1);
222 
223 		ecc_e3_attach_ch(sc, &sc->ecc_chan[0], 0, dimm_ch0,
224 		    MCH_E3_ERRLOG0_C0);
225 		ecc_e3_attach_ch(sc, &sc->ecc_chan[1], 1, dimm_ch1,
226 		    MCH_E3_ERRLOG0_C1);
227 
228 		ecc_active = 1;
229 		if (sc->ecc_ver == ECC_E3_VER_1 ||
230 		    sc->ecc_ver == ECC_E3_VER_2) {
231 			if (((dimm_ch0 | dimm_ch1) & MCH_E3_DIMM_ECC) ==
232 			    MCH_E3_DIMM_ECC_NONE) {
233 				ecc_active = 0;
234 				ecc_printf(sc, "No ECC active\n");
235 			}
236 		} else { /* v3 */
237 			uint32_t ecc_mode0, ecc_mode1;
238 
239 			ecc_mode0 = __SHIFTOUT(dimm_ch0, MCH_E3_DIMM_ECC);
240 			ecc_mode1 = __SHIFTOUT(dimm_ch1, MCH_E3_DIMM_ECC);
241 
242 			/*
243 			 * Only active ALL/NONE is supported
244 			 */
245 
246 			if (ecc_mode0 != MCH_E3_DIMM_ECC_NONE &&
247 			    ecc_mode0 != MCH_E3_DIMM_ECC_ALL) {
248 				ecc_active = 0;
249 				ecc_printf(sc, "channel0, invalid ECC "
250 				    "active 0x%x\n", ecc_mode0);
251 			}
252 			if (ecc_mode1 != MCH_E3_DIMM_ECC_NONE &&
253 			    ecc_mode1 != MCH_E3_DIMM_ECC_ALL) {
254 				ecc_active = 0;
255 				ecc_printf(sc, "channel1, invalid ECC "
256 				    "active 0x%x\n", ecc_mode1);
257 			}
258 
259 			if (ecc_mode0 == MCH_E3_DIMM_ECC_NONE &&
260 			    ecc_mode1 == MCH_E3_DIMM_ECC_NONE) {
261 				ecc_active = 0;
262 				ecc_printf(sc, "No ECC active\n");
263 			}
264 		}
265 
266 		if (!ecc_active)
267 			return 0;
268 	} else {
269 		ecc_printf(sc, "MCHBAR is not enabled\n");
270 
271 		/*
272 		 * Add hw.sensors.eccN.ecc0 MIB.
273 		 */
274 		strlcpy(sc->ecc_sensdev.xname, device_get_nameunit(dev),
275 		    sizeof(sc->ecc_sensdev.xname));
276 		strlcpy(sc->ecc_sens.desc, "node0 ecc",
277 		    sizeof(sc->ecc_sens.desc));
278 		sc->ecc_sens.type = SENSOR_ECC;
279 		sensor_set(&sc->ecc_sens, 0, SENSOR_S_OK);
280 		sensor_attach(&sc->ecc_sensdev, &sc->ecc_sens);
281 		sensordev_install(&sc->ecc_sensdev);
282 
283 		sc->ecc_thresh = ECC_E3_THRESH_DEFAULT;
284 		SYSCTL_ADD_INT(device_get_sysctl_ctx(dev),
285 		    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
286 		    OID_AUTO, "thresh", CTLFLAG_RW, &sc->ecc_thresh, 0,
287 		    "Raise alarm once number of ECC errors "
288 		    "goes above this value");
289 	}
290 
291 	sc->ecc_flags |= ECC_E3_FLAG_SENSTASK;
292 	sensor_task_register(sc, ecc_e3_sensor_task, 1);
293 
294 	return 0;
295 }
296 
297 static void
298 ecc_e3_sensor_task(void *xsc)
299 {
300 	struct ecc_e3_softc *sc = xsc;
301 	device_t dev = sc->ecc_dev;
302 	uint16_t errsts;
303 
304 	errsts = pci_read_config(dev, PCI_E3_ERRSTS, 2);
305 	if (errsts & (PCI_E3_ERRSTS_DSERR | PCI_E3_ERRSTS_DMERR)) {
306 		boolean_t crit = FALSE;
307 
308 		if (errsts & PCI_E3_ERRSTS_DMERR)
309 			crit = TRUE;
310 
311 		if (sc->ecc_parent != NULL)
312 			ecc_e3_errlog(sc, crit);
313 		else
314 			ecc_e3_sensor_update(sc, crit);
315 
316 		/* Clear pending errors */
317 		pci_write_config(dev, PCI_E3_ERRSTS, errsts, 2);
318 	}
319 }
320 
321 static void
322 ecc_e3_attach_ch(struct ecc_e3_softc *sc, struct ecc_e3_chan *chan,
323     int chanid, uint32_t dimm_ch, int errlog0)
324 {
325 	int dimm_size[ECC_E3_CHAN_DIMM_MAX];
326 	uint32_t dimm_szmask[ECC_E3_CHAN_DIMM_MAX];
327 	uint32_t dimm_dlrank[ECC_E3_CHAN_DIMM_MAX];
328 	int rank, dimm;
329 
330 	dimm_szmask[0] = MCH_CORE_DIMM_A_SIZE;
331 	dimm_dlrank[0] = MCH_CORE_DIMM_A_DUAL_RANK;
332 	dimm_szmask[1] = MCH_CORE_DIMM_B_SIZE;
333 	dimm_dlrank[1] = MCH_CORE_DIMM_B_DUAL_RANK;
334 	if (dimm_ch & MCH_CORE_DIMM_A_SELECT) {
335 		dimm_szmask[0] = MCH_CORE_DIMM_B_SIZE;
336 		dimm_dlrank[0] = MCH_CORE_DIMM_B_DUAL_RANK;
337 		dimm_szmask[1] = MCH_CORE_DIMM_A_SIZE;
338 		dimm_dlrank[1] = MCH_CORE_DIMM_A_DUAL_RANK;
339 	}
340 
341 	dimm_size[0] = __SHIFTOUT(dimm_ch, dimm_szmask[0]);
342 	dimm_size[1] = __SHIFTOUT(dimm_ch, dimm_szmask[1]);
343 	if (dimm_size[0] == 0 && dimm_size[1] == 0)
344 		return;
345 
346 	if (bootverbose) {
347 		int ecc;
348 
349 		ecc = __SHIFTOUT(dimm_ch, MCH_E3_DIMM_ECC);
350 		if (ecc == MCH_E3_DIMM_ECC_NONE) {
351 			ecc_printf(sc, "channel%d, no ECC active\n", chanid);
352 		} else if (ecc == MCH_E3_DIMM_ECC_ALL) {
353 			ecc_printf(sc, "channel%d, ECC active IO/logic\n",
354 			    chanid);
355 		} else {
356 			if (sc->ecc_ver == ECC_E3_VER_1 ||
357 			    sc->ecc_ver == ECC_E3_VER_2) {
358 				if (ecc == MCH_E3_DIMM_ECC_IO) {
359 					ecc_printf(sc, "channel%d, "
360 					    "ECC active IO\n", chanid);
361 				} else {
362 					ecc_printf(sc, "channel%d, "
363 					    "ECC active logic\n", chanid);
364 				}
365 			} else { /* v3 */
366 				ecc_printf(sc, "channel%d, "
367 				    "invalid ECC active 0x%x\n", chanid, ecc);
368 			}
369 		}
370 	}
371 
372 	chan->chan_id = chanid;
373 	chan->chan_errlog0 = errlog0;
374 
375 	rank = 0;
376 	for (dimm = 0; dimm < ECC_E3_CHAN_DIMM_MAX; ++dimm) {
377 		struct ecc_e3_dimm *dimm_sc;
378 		struct ecc_e3_rank *rk;
379 		struct ksensor *sens;
380 
381 		if (dimm_size[dimm] == 0)
382 			continue;
383 
384 		dimm_sc = kmalloc(sizeof(*dimm_sc), M_DEVBUF,
385 		    M_WAITOK | M_ZERO);
386 		dimm_sc->dimm_softc = dimm_create(0, chanid, dimm);
387 
388 		sens = &dimm_sc->dimm_sensor;
389 		ksnprintf(sens->desc, sizeof(sens->desc),
390 		    "node0 chan%d DIMM%d ecc", chanid, dimm);
391 		sens->type = SENSOR_ECC;
392 		sensor_set(sens, 0, SENSOR_S_OK);
393 		dimm_sensor_attach(dimm_sc->dimm_softc, sens);
394 
395 		TAILQ_INSERT_TAIL(&sc->ecc_dimm, dimm_sc, dimm_link);
396 
397 		KKASSERT(rank < ECC_E3_CHAN_RANK_MAX - 1);
398 		rk = &chan->chan_rank[rank];
399 		rank++;
400 		rk->rank_dimm_sc = dimm_sc;
401 		if (dimm_ch & dimm_dlrank[dimm]) {
402 			rk = &chan->chan_rank[rank];
403 			rank++;
404 			rk->rank_dimm_sc = dimm_sc;
405 		}
406 	}
407 	chan->chan_rank_cnt = rank;
408 }
409 
410 static void
411 ecc_e3_errlog(struct ecc_e3_softc *sc, boolean_t crit)
412 {
413 	int i;
414 
415 	for (i = 0; i < ECC_E3_CHAN_MAX; ++i) {
416 		struct ecc_e3_chan *chan = &sc->ecc_chan[i];
417 
418 		if (chan->chan_errlog0 != 0)
419 			ecc_e3_errlog_ch(sc, chan, crit);
420 	}
421 }
422 
423 static void
424 ecc_e3_errlog_ch(struct ecc_e3_softc *sc, struct ecc_e3_chan *chan,
425     boolean_t crit)
426 {
427 	uint32_t err0;
428 	int rank;
429 
430 	err0 = CSR_READ_4(sc, chan->chan_errlog0);
431 	if ((err0 & (MCH_E3_ERRLOG0_CERRSTS | MCH_E3_ERRLOG0_MERRSTS)) == 0)
432 		return;
433 
434 	rank = __SHIFTOUT(err0, MCH_E3_ERRLOG0_ERRRANK);
435 	if (rank >= chan->chan_rank_cnt) {
436 		ecc_printf(sc, "channel%d rank%d %serror\n", chan->chan_id,
437 		    rank, crit ? "critical " : "");
438 	} else {
439 		struct ecc_e3_dimm *dimm_sc;
440 
441 		dimm_sc = chan->chan_rank[rank].rank_dimm_sc;
442 		dimm_sensor_ecc_add(dimm_sc->dimm_softc, &dimm_sc->dimm_sensor,
443 		    1, crit);
444 	}
445 }
446 
447 static int
448 ecc_e3_detach(device_t dev)
449 {
450 	struct ecc_e3_softc *sc = device_get_softc(dev);
451 
452 	ecc_e3_stop(sc);
453 
454 	if (sc->ecc_parent != NULL) {
455 		struct ecc_e3_dimm *dimm_sc;
456 
457 		while ((dimm_sc = TAILQ_FIRST(&sc->ecc_dimm)) != NULL) {
458 			TAILQ_REMOVE(&sc->ecc_dimm, dimm_sc, dimm_link);
459 			dimm_sensor_detach(dimm_sc->dimm_softc,
460 			    &dimm_sc->dimm_sensor);
461 			dimm_destroy(dimm_sc->dimm_softc);
462 
463 			kfree(dimm_sc, M_DEVBUF);
464 		}
465 	} else {
466 		sensordev_deinstall(&sc->ecc_sensdev);
467 	}
468 	return 0;
469 }
470 
471 static void
472 ecc_e3_shutdown(device_t dev)
473 {
474 	ecc_e3_stop(device_get_softc(dev));
475 }
476 
477 static void
478 ecc_e3_stop(struct ecc_e3_softc *sc)
479 {
480 	if (sc->ecc_flags & ECC_E3_FLAG_SENSTASK)
481 		sensor_task_unregister(sc);
482 }
483 
484 static void
485 ecc_e3_sensor_update(struct ecc_e3_softc *sc, boolean_t crit)
486 {
487 	enum sensor_status status;
488 
489 	sc->ecc_count++;
490 	if (!crit && sc->ecc_count >= sc->ecc_thresh)
491 		crit = TRUE;
492 
493 	if (crit && (sc->ecc_flags & ECC_E3_FLAG_CRIT) == 0) {
494 		char ecc_str[16];
495 
496 		ksnprintf(ecc_str, sizeof(ecc_str), "%d", sc->ecc_count);
497 		devctl_notify("ecc", "ECC", ecc_str, "node=0");
498 
499 		ecc_printf(sc, "too many ECC errors %d\n", sc->ecc_count);
500 		sc->ecc_flags |= ECC_E3_FLAG_CRIT;
501 	}
502 
503 	if (sc->ecc_flags & ECC_E3_FLAG_CRIT)
504 		status = SENSOR_S_CRIT;
505 	else
506 		status = SENSOR_S_OK;
507 	sensor_set(&sc->ecc_sens, sc->ecc_count, status);
508 }
509