1 /* 2 * Copyright (c) 2011 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Sepherosa Ziehau <sepherosa@gmail.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/bus.h> 38 #include <sys/kernel.h> 39 #include <sys/malloc.h> 40 #include <sys/bitops.h> 41 #include <sys/sensors.h> 42 43 #include <bus/pci/pcivar.h> 44 #include <bus/pci/pcireg.h> 45 #include <bus/pci/pci_cfgreg.h> 46 47 #include <vm/pmap.h> 48 49 #include "coremctl_if.h" 50 #include "pcib_if.h" 51 52 #include <dev/misc/dimm/dimm.h> 53 #include <dev/misc/coremctl/coremctl_reg.h> 54 55 #define ECC_E3_VER_1 1 /* Sandy Bridge */ 56 #define ECC_E3_VER_2 2 /* Ivy Bridge */ 57 #define ECC_E3_VER_3 3 /* Haswell */ 58 59 #define ECC_E3_THRESH_DEFAULT 5 60 61 #define ECC_E3_CHAN_MAX 2 62 #define ECC_E3_CHAN_DIMM_MAX 2 63 #define ECC_E3_DIMM_RANK_MAX 2 64 #define ECC_E3_CHAN_RANK_MAX (ECC_E3_CHAN_DIMM_MAX * ECC_E3_DIMM_RANK_MAX) 65 66 struct ecc_e3_type { 67 uint16_t did; 68 const char *desc; 69 int ver; /* ECC_E3_VER_ */ 70 }; 71 72 struct ecc_e3_dimm { 73 TAILQ_ENTRY(ecc_e3_dimm) dimm_link; 74 struct dimm_softc *dimm_softc; 75 struct ksensor dimm_sensor; 76 }; 77 78 struct ecc_e3_rank { 79 struct ecc_e3_dimm *rank_dimm_sc; 80 }; 81 82 struct ecc_e3_chan { 83 int chan_id; 84 int chan_errlog0; 85 int chan_rank_cnt; 86 struct ecc_e3_rank chan_rank[ECC_E3_CHAN_RANK_MAX]; 87 }; 88 89 struct ecc_e3_softc { 90 device_t ecc_dev; 91 device_t ecc_parent; /* non-NULL if parent has MCHBAR */ 92 int ecc_ver; /* ECC_E3_VER_ */ 93 uint32_t ecc_flags; /* ECC_E3_FLAG_ */ 94 95 struct ecc_e3_chan ecc_chan[ECC_E3_CHAN_MAX]; 96 TAILQ_HEAD(, ecc_e3_dimm) ecc_dimm; 97 98 /* 99 * If the parent does not have MCHBAR, 100 * i.e. no DIMM location information 101 * for the ECC errors, fallback to the 102 * sensor and counters below. 103 */ 104 struct ksensordev ecc_sensdev; 105 struct ksensor ecc_sens; 106 int ecc_count; 107 int ecc_thresh; 108 }; 109 110 #define ECC_E3_FLAG_SENSTASK 0x1 111 #define ECC_E3_FLAG_CRIT 0x2 112 113 #define ecc_printf(sc, fmt, arg...) \ 114 device_printf((sc)->ecc_dev, fmt , ##arg) 115 116 static int ecc_e3_probe(device_t); 117 static int ecc_e3_attach(device_t); 118 static int ecc_e3_detach(device_t); 119 static void ecc_e3_shutdown(device_t); 120 121 static void ecc_e3_attach_ch(struct ecc_e3_softc *, struct ecc_e3_chan *, 122 int, uint32_t, int); 123 static void ecc_e3_errlog(struct ecc_e3_softc *, boolean_t); 124 static void ecc_e3_errlog_ch(struct ecc_e3_softc *, struct ecc_e3_chan *, 125 boolean_t); 126 static void ecc_e3_stop(struct ecc_e3_softc *); 127 128 static void ecc_e3_sensor_task(void *); 129 static void ecc_e3_sensor_update(struct ecc_e3_softc *, boolean_t); 130 131 static const struct ecc_e3_type ecc_e3_types[] = { 132 { PCI_E3V1_MEMCTL_DID, "Intel E3 ECC", ECC_E3_VER_1 }, 133 { PCI_E3V2_MEMCTL_DID, "Intel E3 v2 ECC", ECC_E3_VER_2 }, 134 { PCI_E3V3_MEMCTL_DID, "Intel E3 v3 ECC", ECC_E3_VER_3 }, 135 { 0, NULL, 0 } /* required last entry */ 136 }; 137 138 static device_method_t ecc_e3_methods[] = { 139 /* Device interface */ 140 DEVMETHOD(device_probe, ecc_e3_probe), 141 DEVMETHOD(device_attach, ecc_e3_attach), 142 DEVMETHOD(device_detach, ecc_e3_detach), 143 DEVMETHOD(device_shutdown, ecc_e3_shutdown), 144 DEVMETHOD(device_suspend, bus_generic_suspend), 145 DEVMETHOD(device_resume, bus_generic_resume), 146 DEVMETHOD_END 147 }; 148 149 static driver_t ecc_e3_driver = { 150 "ecc", 151 ecc_e3_methods, 152 sizeof(struct ecc_e3_softc) 153 }; 154 static devclass_t ecc_devclass; 155 DRIVER_MODULE(ecc_e3, coremctl, ecc_e3_driver, ecc_devclass, NULL, NULL); 156 MODULE_DEPEND(ecc_e3, pci, 1, 1, 1); 157 MODULE_DEPEND(ecc_e3, coremctl, 1, 1, 1); 158 159 static __inline uint32_t 160 CSR_READ_4(struct ecc_e3_softc *sc, int ofs) 161 { 162 uint32_t val; 163 int error; 164 165 error = COREMCTL_MCH_READ(sc->ecc_parent, ofs, &val); 166 KASSERT(!error, ("mch read failed")); 167 168 return val; 169 } 170 171 static int 172 ecc_e3_probe(device_t dev) 173 { 174 const struct ecc_e3_type *t; 175 uint16_t did; 176 177 if (pci_get_vendor(dev) != PCI_CORE_MEMCTL_VID) 178 return ENXIO; 179 180 did = pci_get_device(dev); 181 for (t = ecc_e3_types; t->desc != NULL; ++t) { 182 if (t->did == did) { 183 struct ecc_e3_softc *sc = device_get_softc(dev); 184 185 device_set_desc(dev, t->desc); 186 sc->ecc_ver = t->ver; 187 return 0; 188 } 189 } 190 return ENXIO; 191 } 192 193 static int 194 ecc_e3_attach(device_t dev) 195 { 196 struct ecc_e3_softc *sc = device_get_softc(dev); 197 uint32_t val; 198 int error; 199 200 TAILQ_INIT(&sc->ecc_dimm); 201 sc->ecc_dev = dev; 202 203 /* Probe the existance of MCHBAR */ 204 error = COREMCTL_MCH_READ(device_get_parent(dev), MCH_CORE_DIMM_CH0, 205 &val); 206 if (!error) 207 sc->ecc_parent = device_get_parent(dev); 208 209 if (sc->ecc_parent != NULL) { 210 uint32_t dimm_ch0, dimm_ch1; 211 int ecc_active; 212 213 if (bootverbose) { 214 ecc_printf(sc, "LOG0_C0 %#x\n", 215 CSR_READ_4(sc, MCH_E3_ERRLOG0_C0)); 216 ecc_printf(sc, "LOG0_C1 %#x\n", 217 CSR_READ_4(sc, MCH_E3_ERRLOG0_C1)); 218 } 219 220 dimm_ch0 = CSR_READ_4(sc, MCH_CORE_DIMM_CH0); 221 dimm_ch1 = CSR_READ_4(sc, MCH_CORE_DIMM_CH1); 222 223 ecc_e3_attach_ch(sc, &sc->ecc_chan[0], 0, dimm_ch0, 224 MCH_E3_ERRLOG0_C0); 225 ecc_e3_attach_ch(sc, &sc->ecc_chan[1], 1, dimm_ch1, 226 MCH_E3_ERRLOG0_C1); 227 228 ecc_active = 1; 229 if (sc->ecc_ver == ECC_E3_VER_1 || 230 sc->ecc_ver == ECC_E3_VER_2) { 231 if (((dimm_ch0 | dimm_ch1) & MCH_E3_DIMM_ECC) == 232 MCH_E3_DIMM_ECC_NONE) { 233 ecc_active = 0; 234 ecc_printf(sc, "No ECC active\n"); 235 } 236 } else { /* v3 */ 237 uint32_t ecc_mode0, ecc_mode1; 238 239 ecc_mode0 = __SHIFTOUT(dimm_ch0, MCH_E3_DIMM_ECC); 240 ecc_mode1 = __SHIFTOUT(dimm_ch1, MCH_E3_DIMM_ECC); 241 242 /* 243 * Only active ALL/NONE is supported 244 */ 245 246 if (ecc_mode0 != MCH_E3_DIMM_ECC_NONE && 247 ecc_mode0 != MCH_E3_DIMM_ECC_ALL) { 248 ecc_active = 0; 249 ecc_printf(sc, "channel0, invalid ECC " 250 "active 0x%x\n", ecc_mode0); 251 } 252 if (ecc_mode1 != MCH_E3_DIMM_ECC_NONE && 253 ecc_mode1 != MCH_E3_DIMM_ECC_ALL) { 254 ecc_active = 0; 255 ecc_printf(sc, "channel1, invalid ECC " 256 "active 0x%x\n", ecc_mode1); 257 } 258 259 if (ecc_mode0 == MCH_E3_DIMM_ECC_NONE && 260 ecc_mode1 == MCH_E3_DIMM_ECC_NONE) { 261 ecc_active = 0; 262 ecc_printf(sc, "No ECC active\n"); 263 } 264 } 265 266 if (!ecc_active) 267 return 0; 268 } else { 269 ecc_printf(sc, "MCHBAR is not enabled\n"); 270 271 /* 272 * Add hw.sensors.eccN.ecc0 MIB. 273 */ 274 strlcpy(sc->ecc_sensdev.xname, device_get_nameunit(dev), 275 sizeof(sc->ecc_sensdev.xname)); 276 strlcpy(sc->ecc_sens.desc, "node0 ecc", 277 sizeof(sc->ecc_sens.desc)); 278 sc->ecc_sens.type = SENSOR_ECC; 279 sensor_set(&sc->ecc_sens, 0, SENSOR_S_OK); 280 sensor_attach(&sc->ecc_sensdev, &sc->ecc_sens); 281 sensordev_install(&sc->ecc_sensdev); 282 283 sc->ecc_thresh = ECC_E3_THRESH_DEFAULT; 284 SYSCTL_ADD_INT(device_get_sysctl_ctx(dev), 285 SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), 286 OID_AUTO, "thresh", CTLFLAG_RW, &sc->ecc_thresh, 0, 287 "Raise alarm once number of ECC errors " 288 "goes above this value"); 289 } 290 291 sc->ecc_flags |= ECC_E3_FLAG_SENSTASK; 292 sensor_task_register(sc, ecc_e3_sensor_task, 1); 293 294 return 0; 295 } 296 297 static void 298 ecc_e3_sensor_task(void *xsc) 299 { 300 struct ecc_e3_softc *sc = xsc; 301 device_t dev = sc->ecc_dev; 302 uint16_t errsts; 303 304 errsts = pci_read_config(dev, PCI_E3_ERRSTS, 2); 305 if (errsts & (PCI_E3_ERRSTS_DSERR | PCI_E3_ERRSTS_DMERR)) { 306 boolean_t crit = FALSE; 307 308 if (errsts & PCI_E3_ERRSTS_DMERR) 309 crit = TRUE; 310 311 if (sc->ecc_parent != NULL) 312 ecc_e3_errlog(sc, crit); 313 else 314 ecc_e3_sensor_update(sc, crit); 315 316 /* Clear pending errors */ 317 pci_write_config(dev, PCI_E3_ERRSTS, errsts, 2); 318 } 319 } 320 321 static void 322 ecc_e3_attach_ch(struct ecc_e3_softc *sc, struct ecc_e3_chan *chan, 323 int chanid, uint32_t dimm_ch, int errlog0) 324 { 325 int dimm_size[ECC_E3_CHAN_DIMM_MAX]; 326 uint32_t dimm_szmask[ECC_E3_CHAN_DIMM_MAX]; 327 uint32_t dimm_dlrank[ECC_E3_CHAN_DIMM_MAX]; 328 int rank, dimm; 329 330 dimm_szmask[0] = MCH_CORE_DIMM_A_SIZE; 331 dimm_dlrank[0] = MCH_CORE_DIMM_A_DUAL_RANK; 332 dimm_szmask[1] = MCH_CORE_DIMM_B_SIZE; 333 dimm_dlrank[1] = MCH_CORE_DIMM_B_DUAL_RANK; 334 if (dimm_ch & MCH_CORE_DIMM_A_SELECT) { 335 dimm_szmask[0] = MCH_CORE_DIMM_B_SIZE; 336 dimm_dlrank[0] = MCH_CORE_DIMM_B_DUAL_RANK; 337 dimm_szmask[1] = MCH_CORE_DIMM_A_SIZE; 338 dimm_dlrank[1] = MCH_CORE_DIMM_A_DUAL_RANK; 339 } 340 341 dimm_size[0] = __SHIFTOUT(dimm_ch, dimm_szmask[0]); 342 dimm_size[1] = __SHIFTOUT(dimm_ch, dimm_szmask[1]); 343 if (dimm_size[0] == 0 && dimm_size[1] == 0) 344 return; 345 346 if (bootverbose) { 347 int ecc; 348 349 ecc = __SHIFTOUT(dimm_ch, MCH_E3_DIMM_ECC); 350 if (ecc == MCH_E3_DIMM_ECC_NONE) { 351 ecc_printf(sc, "channel%d, no ECC active\n", chanid); 352 } else if (ecc == MCH_E3_DIMM_ECC_ALL) { 353 ecc_printf(sc, "channel%d, ECC active IO/logic\n", 354 chanid); 355 } else { 356 if (sc->ecc_ver == ECC_E3_VER_1 || 357 sc->ecc_ver == ECC_E3_VER_2) { 358 if (ecc == MCH_E3_DIMM_ECC_IO) { 359 ecc_printf(sc, "channel%d, " 360 "ECC active IO\n", chanid); 361 } else { 362 ecc_printf(sc, "channel%d, " 363 "ECC active logic\n", chanid); 364 } 365 } else { /* v3 */ 366 ecc_printf(sc, "channel%d, " 367 "invalid ECC active 0x%x\n", chanid, ecc); 368 } 369 } 370 } 371 372 chan->chan_id = chanid; 373 chan->chan_errlog0 = errlog0; 374 375 rank = 0; 376 for (dimm = 0; dimm < ECC_E3_CHAN_DIMM_MAX; ++dimm) { 377 struct ecc_e3_dimm *dimm_sc; 378 struct ecc_e3_rank *rk; 379 struct ksensor *sens; 380 381 if (dimm_size[dimm] == 0) 382 continue; 383 384 dimm_sc = kmalloc(sizeof(*dimm_sc), M_DEVBUF, 385 M_WAITOK | M_ZERO); 386 dimm_sc->dimm_softc = dimm_create(0, chanid, dimm); 387 388 sens = &dimm_sc->dimm_sensor; 389 ksnprintf(sens->desc, sizeof(sens->desc), 390 "node0 chan%d DIMM%d ecc", chanid, dimm); 391 sens->type = SENSOR_ECC; 392 sensor_set(sens, 0, SENSOR_S_OK); 393 dimm_sensor_attach(dimm_sc->dimm_softc, sens); 394 395 TAILQ_INSERT_TAIL(&sc->ecc_dimm, dimm_sc, dimm_link); 396 397 KKASSERT(rank < ECC_E3_CHAN_RANK_MAX - 1); 398 rk = &chan->chan_rank[rank]; 399 rank++; 400 rk->rank_dimm_sc = dimm_sc; 401 if (dimm_ch & dimm_dlrank[dimm]) { 402 rk = &chan->chan_rank[rank]; 403 rank++; 404 rk->rank_dimm_sc = dimm_sc; 405 } 406 } 407 chan->chan_rank_cnt = rank; 408 } 409 410 static void 411 ecc_e3_errlog(struct ecc_e3_softc *sc, boolean_t crit) 412 { 413 int i; 414 415 for (i = 0; i < ECC_E3_CHAN_MAX; ++i) { 416 struct ecc_e3_chan *chan = &sc->ecc_chan[i]; 417 418 if (chan->chan_errlog0 != 0) 419 ecc_e3_errlog_ch(sc, chan, crit); 420 } 421 } 422 423 static void 424 ecc_e3_errlog_ch(struct ecc_e3_softc *sc, struct ecc_e3_chan *chan, 425 boolean_t crit) 426 { 427 uint32_t err0; 428 int rank; 429 430 err0 = CSR_READ_4(sc, chan->chan_errlog0); 431 if ((err0 & (MCH_E3_ERRLOG0_CERRSTS | MCH_E3_ERRLOG0_MERRSTS)) == 0) 432 return; 433 434 rank = __SHIFTOUT(err0, MCH_E3_ERRLOG0_ERRRANK); 435 if (rank >= chan->chan_rank_cnt) { 436 ecc_printf(sc, "channel%d rank%d %serror\n", chan->chan_id, 437 rank, crit ? "critical " : ""); 438 } else { 439 struct ecc_e3_dimm *dimm_sc; 440 441 dimm_sc = chan->chan_rank[rank].rank_dimm_sc; 442 dimm_sensor_ecc_add(dimm_sc->dimm_softc, &dimm_sc->dimm_sensor, 443 1, crit); 444 } 445 } 446 447 static int 448 ecc_e3_detach(device_t dev) 449 { 450 struct ecc_e3_softc *sc = device_get_softc(dev); 451 452 ecc_e3_stop(sc); 453 454 if (sc->ecc_parent != NULL) { 455 struct ecc_e3_dimm *dimm_sc; 456 457 while ((dimm_sc = TAILQ_FIRST(&sc->ecc_dimm)) != NULL) { 458 TAILQ_REMOVE(&sc->ecc_dimm, dimm_sc, dimm_link); 459 dimm_sensor_detach(dimm_sc->dimm_softc, 460 &dimm_sc->dimm_sensor); 461 dimm_destroy(dimm_sc->dimm_softc); 462 463 kfree(dimm_sc, M_DEVBUF); 464 } 465 } else { 466 sensordev_deinstall(&sc->ecc_sensdev); 467 } 468 return 0; 469 } 470 471 static void 472 ecc_e3_shutdown(device_t dev) 473 { 474 ecc_e3_stop(device_get_softc(dev)); 475 } 476 477 static void 478 ecc_e3_stop(struct ecc_e3_softc *sc) 479 { 480 if (sc->ecc_flags & ECC_E3_FLAG_SENSTASK) 481 sensor_task_unregister(sc); 482 } 483 484 static void 485 ecc_e3_sensor_update(struct ecc_e3_softc *sc, boolean_t crit) 486 { 487 enum sensor_status status; 488 489 sc->ecc_count++; 490 if (!crit && sc->ecc_count >= sc->ecc_thresh) 491 crit = TRUE; 492 493 if (crit && (sc->ecc_flags & ECC_E3_FLAG_CRIT) == 0) { 494 char ecc_str[16]; 495 496 ksnprintf(ecc_str, sizeof(ecc_str), "%d", sc->ecc_count); 497 devctl_notify("ecc", "ECC", ecc_str, "node=0"); 498 499 ecc_printf(sc, "too many ECC errors %d\n", sc->ecc_count); 500 sc->ecc_flags |= ECC_E3_FLAG_CRIT; 501 } 502 503 if (sc->ecc_flags & ECC_E3_FLAG_CRIT) 504 status = SENSOR_S_CRIT; 505 else 506 status = SENSOR_S_OK; 507 sensor_set(&sc->ecc_sens, sc->ecc_count, status); 508 } 509