xref: /dragonfly/sys/dev/misc/ecc/ecc_e5.c (revision 7f38fe7b)
1 /*
2  * Copyright (c) 2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Sepherosa Ziehau <sepherosa@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/bitops.h>
38 #include <sys/bus.h>
39 #include <sys/cpu_topology.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/queue.h>
43 #include <sys/sensors.h>
44 
45 #include <bus/pci/pcivar.h>
46 #include <bus/pci/pcireg.h>
47 #include <bus/pci/pci_cfgreg.h>
48 #include <bus/pci/pcib_private.h>
49 
50 #include "pcib_if.h"
51 
52 #include <dev/misc/dimm/dimm.h>
53 #include <dev/misc/ecc/e5_imc_reg.h>
54 #include <dev/misc/ecc/e5_imc_var.h>
55 
56 struct ecc_e5_dimm {
57 	TAILQ_ENTRY(ecc_e5_dimm) dimm_link;
58 	struct dimm_softc	*dimm_softc;
59 	struct ksensor		dimm_sensor;
60 };
61 
/*
 * One entry per rank on the channel; records which DIMM the rank
 * belongs to so per-rank error counts can be charged to that DIMM.
 */
struct ecc_e5_rank {
	struct ecc_e5_dimm	*rank_dimm_sc;	/* DIMM providing this rank */
};
65 
66 struct ecc_e5_softc {
67 	device_t		ecc_dev;
68 	const struct e5_imc_chan *ecc_chan;
69 	int			ecc_node;
70 	int			ecc_rank_cnt;
71 	struct ecc_e5_rank	ecc_rank[PCI_E5_IMC_ERROR_RANK_MAX];
72 	struct sensor_task	*ecc_senstask;
73 	TAILQ_HEAD(, ecc_e5_dimm) ecc_dimm;
74 };
75 
/* device_printf() on the softc's device; sc is a struct ecc_e5_softc *. */
#define ecc_printf(sc, fmt, ...) \
	device_printf((sc)->ecc_dev, fmt , ##__VA_ARGS__)
78 
79 static int	ecc_e5_probe(device_t);
80 static int	ecc_e5_attach(device_t);
81 static int	ecc_e5_detach(device_t);
82 static void	ecc_e5_shutdown(device_t);
83 
84 static void	ecc_e5_sensor_task(void *);
85 
86 #define ECC_E5_CHAN(v, imc, c, c_ext)				\
87 {								\
88 	.did		= PCI_E5V##v##_IMC##imc##_ERROR_CHN##c##_DID_ID, \
89 	.slot		= PCISLOT_E5V##v##_IMC##imc##_ERROR_CHN##c, \
90 	.func		= PCIFUNC_E5V##v##_IMC##imc##_ERROR_CHN##c, \
91 	.desc		= "Intel E5 v" #v " ECC",		\
92 								\
93 	E5_IMC_CHAN_FIELDS(v, imc, c, c_ext)			\
94 }
95 
96 #define ECC_E5_CHAN_V2(c)		ECC_E5_CHAN(2, 0, c, c)
97 #define ECC_E5_CHAN_IMC0_V3(c)		ECC_E5_CHAN(3, 0, c, c)
98 #define ECC_E5_CHAN_IMC1_V3(c, c_ext)	ECC_E5_CHAN(3, 1, c, c_ext)
99 #define ECC_E5_CHAN_END			E5_IMC_CHAN_END
100 
101 static const struct e5_imc_chan ecc_e5_chans[] = {
102 	ECC_E5_CHAN_V2(0),
103 	ECC_E5_CHAN_V2(1),
104 	ECC_E5_CHAN_V2(2),
105 	ECC_E5_CHAN_V2(3),
106 
107 	ECC_E5_CHAN_IMC0_V3(0),
108 	ECC_E5_CHAN_IMC0_V3(1),
109 	ECC_E5_CHAN_IMC0_V3(2),
110 	ECC_E5_CHAN_IMC0_V3(3),
111 	ECC_E5_CHAN_IMC1_V3(0, 2),	/* IMC1 chan0 -> channel2 */
112 	ECC_E5_CHAN_IMC1_V3(1, 3),	/* IMC1 chan1 -> channel3 */
113 
114 	ECC_E5_CHAN_END
115 };
116 
117 #undef ECC_E5_CHAN_END
118 #undef ECC_E5_CHAN_V2
119 #undef ECC_E5_CHAN
120 
121 static device_method_t ecc_e5_methods[] = {
122 	/* Device interface */
123 	DEVMETHOD(device_probe,		ecc_e5_probe),
124 	DEVMETHOD(device_attach,	ecc_e5_attach),
125 	DEVMETHOD(device_detach,	ecc_e5_detach),
126 	DEVMETHOD(device_shutdown,	ecc_e5_shutdown),
127 	DEVMETHOD(device_suspend,	bus_generic_suspend),
128 	DEVMETHOD(device_resume,	bus_generic_resume),
129 	DEVMETHOD_END
130 };
131 
132 static driver_t ecc_e5_driver = {
133 	"ecc",
134 	ecc_e5_methods,
135 	sizeof(struct ecc_e5_softc)
136 };
137 static devclass_t ecc_devclass;
138 DRIVER_MODULE(ecc_e5, pci, ecc_e5_driver, ecc_devclass, NULL, NULL);
139 MODULE_DEPEND(ecc_e5, pci, 1, 1, 1);
140 MODULE_DEPEND(ecc_e5, dimm, 1, 1, 1);
141 
142 static int
143 ecc_e5_probe(device_t dev)
144 {
145 	const struct e5_imc_chan *c;
146 	uint16_t vid, did;
147 	int slot, func;
148 
149 	vid = pci_get_vendor(dev);
150 	if (vid != PCI_E5_IMC_VID_ID)
151 		return ENXIO;
152 
153 	did = pci_get_device(dev);
154 	slot = pci_get_slot(dev);
155 	func = pci_get_function(dev);
156 
157 	for (c = ecc_e5_chans; c->desc != NULL; ++c) {
158 		if (c->did == did && c->slot == slot && c->func == func) {
159 			struct ecc_e5_softc *sc = device_get_softc(dev);
160 			int node;
161 
162 			node = e5_imc_node_probe(dev, c);
163 			if (node < 0)
164 				break;
165 
166 			device_set_desc(dev, c->desc);
167 
168 			sc->ecc_chan = c;
169 			sc->ecc_node = node;
170 			return 0;
171 		}
172 	}
173 	return ENXIO;
174 }
175 
176 static int
177 ecc_e5_attach(device_t dev)
178 {
179 	struct ecc_e5_softc *sc = device_get_softc(dev);
180 	int dimm, rank, error, cpuid;
181 	const cpu_node_t *node;
182 	uint32_t mcmtr;
183 
184 	TAILQ_INIT(&sc->ecc_dimm);
185 	sc->ecc_dev = dev;
186 
187 	mcmtr = IMC_CPGC_READ_4(sc->ecc_dev, sc->ecc_chan,
188 	    PCI_E5_IMC_CPGC_MCMTR);
189 	if (bootverbose) {
190 		if (sc->ecc_chan->ver == E5_IMC_CHAN_VER3 &&
191 		    (mcmtr & PCI_E5V3_IMC_CPGC_MCMTR_DDR4))
192 			ecc_printf(sc, "DDR4\n");
193 		if (__SHIFTOUT(mcmtr, PCI_E5_IMC_CPGC_MCMTR_IMC_MODE) ==
194 		    PCI_E5_IMC_CPGC_MCMTR_IMC_MODE_DDR3) {
195 			ecc_printf(sc, "native %s\n",
196 			    sc->ecc_chan->ver == E5_IMC_CHAN_VER2 ?
197 			    "DDR3" : "DDR");
198 		}
199 	}
200 
201 	rank = 0;
202 	for (dimm = 0; dimm < PCI_E5_IMC_CHN_DIMM_MAX; ++dimm) {
203 		struct ecc_e5_dimm *dimm_sc;
204 		struct ksensor *sens;
205 		const char *width;
206 		uint32_t dimmmtr;
207 		int rank_cnt, r;
208 		int density;
209 		int val;
210 
211 		dimmmtr = IMC_CTAD_READ_4(sc->ecc_dev, sc->ecc_chan,
212 		    PCI_E5_IMC_CTAD_DIMMMTR(dimm));
213 
214 		if ((dimmmtr & PCI_E5_IMC_CTAD_DIMMMTR_DIMM_POP) == 0)
215 			continue;
216 
217 		val = __SHIFTOUT(dimmmtr, PCI_E5_IMC_CTAD_DIMMMTR_RANK_CNT);
218 		switch (val) {
219 		case PCI_E5_IMC_CTAD_DIMMMTR_RANK_CNT_SR:
220 			rank_cnt = 1;
221 			break;
222 		case PCI_E5_IMC_CTAD_DIMMMTR_RANK_CNT_DR:
223 			rank_cnt = 2;
224 			break;
225 		case PCI_E5_IMC_CTAD_DIMMMTR_RANK_CNT_QR:
226 			rank_cnt = 4;
227 			break;
228 		case PCI_E5V3_IMC_CTAD_DIMMMTR_RANK_CNT_8R:
229 			if (sc->ecc_chan->ver >= E5_IMC_CHAN_VER3) {
230 				rank_cnt = 8;
231 				break;
232 			}
233 			/* FALL THROUGH */
234 		default:
235 			ecc_printf(sc, "unknown rank count 0x%x\n", val);
236 			error = ENXIO;
237 			goto failed;
238 		}
239 
240 		val = __SHIFTOUT(dimmmtr, PCI_E5_IMC_CTAD_DIMMMTR_DDR3_WIDTH);
241 		switch (val) {
242 		case PCI_E5_IMC_CTAD_DIMMMTR_DDR3_WIDTH_4:
243 			width = "x4";
244 			break;
245 		case PCI_E5_IMC_CTAD_DIMMMTR_DDR3_WIDTH_8:
246 			width = "x8";
247 			break;
248 		case PCI_E5_IMC_CTAD_DIMMMTR_DDR3_WIDTH_16:
249 			width = "x16";
250 			break;
251 		default:
252 			ecc_printf(sc, "unknown ddr3 width 0x%x\n", val);
253 			error = ENXIO;
254 			goto failed;
255 		}
256 
257 		val = __SHIFTOUT(dimmmtr, PCI_E5_IMC_CTAD_DIMMMTR_DDR3_DNSTY);
258 		switch (val) {
259 		case PCI_E5_IMC_CTAD_DIMMMTR_DDR3_DNSTY_2G:
260 			density = 2;
261 			break;
262 		case PCI_E5_IMC_CTAD_DIMMMTR_DDR3_DNSTY_4G:
263 			density = 4;
264 			break;
265 		case PCI_E5_IMC_CTAD_DIMMMTR_DDR3_DNSTY_8G:
266 			density = 8;
267 			break;
268 		case PCI_E5_IMC_CTAD_DIMMMTR_DDR3_DNSTY_1G:
269 			if (sc->ecc_chan->ver < E5_IMC_CHAN_VER3) {
270 				density = 1;
271 				break;
272 			}
273 			/* FALL THROUGH */
274 		default:
275 			ecc_printf(sc, "unknown ddr3 density 0x%x\n", val);
276 			error = ENXIO;
277 			goto failed;
278 		}
279 
280 		if (bootverbose) {
281 			ecc_printf(sc, "DIMM%d %dGB, %d%s, density %dGB\n",
282 			    dimm, density * rank_cnt * 2,
283 			    rank_cnt, width, density);
284 		}
285 
286 		dimm_sc = kmalloc(sizeof(*dimm_sc), M_DEVBUF,
287 		    M_WAITOK | M_ZERO);
288 		dimm_sc->dimm_softc =
289 		    dimm_create(sc->ecc_node, sc->ecc_chan->chan_ext, dimm);
290 
291 		sens = &dimm_sc->dimm_sensor;
292 		ksnprintf(sens->desc, sizeof(sens->desc),
293 		    "node%d chan%d DIMM%d ecc",
294 		    sc->ecc_node, sc->ecc_chan->chan_ext, dimm);
295 		sens->type = SENSOR_ECC;
296 		sensor_set(sens, 0, SENSOR_S_OK);
297 		dimm_sensor_attach(dimm_sc->dimm_softc, sens);
298 
299 		TAILQ_INSERT_TAIL(&sc->ecc_dimm, dimm_sc, dimm_link);
300 
301 		for (r = 0; r < rank_cnt; ++r) {
302 			struct ecc_e5_rank *rk;
303 
304 			if (rank >= PCI_E5_IMC_ERROR_RANK_MAX) {
305 				ecc_printf(sc, "too many ranks\n");
306 				error = ENXIO;
307 				goto failed;
308 			}
309 
310 			rk = &sc->ecc_rank[rank];
311 			rk->rank_dimm_sc = dimm_sc;
312 			++rank;
313 		}
314 	}
315 	sc->ecc_rank_cnt = rank;
316 
317 	if ((mcmtr & PCI_E5_IMC_CPGC_MCMTR_ECC_EN) == 0) {
318 		ecc_printf(sc, "ECC is not enabled\n");
319 		return 0;
320 	}
321 
322 	for (rank = 0; rank < sc->ecc_rank_cnt; ++rank) {
323 		const struct ecc_e5_rank *rk = &sc->ecc_rank[rank];
324 		uint32_t thr, mask;
325 		int ofs;
326 
327 		ofs = PCI_E5_IMC_ERROR_COR_ERR_TH(rank / 2);
328 		if (rank & 1)
329 			mask = PCI_E5_IMC_ERROR_COR_ERR_TH_HI;
330 		else
331 			mask = PCI_E5_IMC_ERROR_COR_ERR_TH_LO;
332 
333 		thr = pci_read_config(sc->ecc_dev, ofs, 4);
334 		dimm_set_ecc_thresh(rk->rank_dimm_sc->dimm_softc,
335 		    __SHIFTOUT(thr, mask));
336 	}
337 
338 	cpuid = -1;
339 	node = get_cpu_node_by_chipid(sc->ecc_node);
340 	if (node != NULL && node->child_no > 0) {
341 		cpuid = BSRCPUMASK(node->members);
342 		if (bootverbose) {
343 			device_printf(dev, "node%d chan%d -> cpu%d\n",
344 			    sc->ecc_node, sc->ecc_chan->chan_ext, cpuid);
345 		}
346 	}
347 	sc->ecc_senstask = sensor_task_register2(sc, ecc_e5_sensor_task,
348 	    1, cpuid);
349 
350 	return 0;
351 failed:
352 	ecc_e5_detach(dev);
353 	return error;
354 }
355 
356 static void
357 ecc_e5_sensor_task(void *xsc)
358 {
359 	struct ecc_e5_softc *sc = xsc;
360 	uint32_t err_ranks, val;
361 
362 	val = pci_read_config(sc->ecc_dev, PCI_E5_IMC_ERROR_COR_ERR_STAT, 4);
363 
364 	err_ranks = (val & PCI_E5_IMC_ERROR_COR_ERR_STAT_RANKS);
365 	while (err_ranks != 0) {
366 		int rank;
367 
368 		rank = ffs(err_ranks) - 1;
369 		err_ranks &= ~(1 << rank);
370 
371 		if (rank < sc->ecc_rank_cnt) {
372 			const struct ecc_e5_rank *rk = &sc->ecc_rank[rank];
373 			struct ecc_e5_dimm *dimm_sc = rk->rank_dimm_sc;
374 			uint32_t err, mask;
375 			int ofs, ecc_cnt;
376 
377 			ofs = PCI_E5_IMC_ERROR_COR_ERR_CNT(rank / 2);
378 			if (rank & 1)
379 				mask = PCI_E5_IMC_ERROR_COR_ERR_CNT_HI;
380 			else
381 				mask = PCI_E5_IMC_ERROR_COR_ERR_CNT_LO;
382 
383 			err = pci_read_config(sc->ecc_dev, ofs, 4);
384 			ecc_cnt = __SHIFTOUT(err, mask);
385 
386 			dimm_sensor_ecc_set(dimm_sc->dimm_softc,
387 			    &dimm_sc->dimm_sensor, ecc_cnt, TRUE);
388 		} else {
389 			ecc_printf(sc, "channel%d rank%d critical error\n",
390 			    sc->ecc_chan->chan_ext, rank);
391 		}
392 	}
393 
394 	if (val & PCI_E5_IMC_ERROR_COR_ERR_STAT_RANKS) {
395 		pci_write_config(sc->ecc_dev, PCI_E5_IMC_ERROR_COR_ERR_STAT,
396 		    val, 4);
397 	}
398 }
399 
400 static void
401 ecc_e5_stop(device_t dev)
402 {
403 	struct ecc_e5_softc *sc = device_get_softc(dev);
404 
405 	if (sc->ecc_senstask != NULL) {
406 		sensor_task_unregister2(sc->ecc_senstask);
407 		sc->ecc_senstask = NULL;
408 	}
409 }
410 
411 static int
412 ecc_e5_detach(device_t dev)
413 {
414 	struct ecc_e5_softc *sc = device_get_softc(dev);
415 	struct ecc_e5_dimm *dimm_sc;
416 
417 	ecc_e5_stop(dev);
418 
419 	while ((dimm_sc = TAILQ_FIRST(&sc->ecc_dimm)) != NULL) {
420 		TAILQ_REMOVE(&sc->ecc_dimm, dimm_sc, dimm_link);
421 		dimm_sensor_detach(dimm_sc->dimm_softc, &dimm_sc->dimm_sensor);
422 		dimm_destroy(dimm_sc->dimm_softc);
423 
424 		kfree(dimm_sc, M_DEVBUF);
425 	}
426 	return 0;
427 }
428 
429 static void
430 ecc_e5_shutdown(device_t dev)
431 {
432 	ecc_e5_stop(dev);
433 }
434