1 /* 2 * Copyright (c) 2011 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Sepherosa Ziehau <sepherosa@gmail.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/bus.h> 38 #include <sys/kernel.h> 39 #include <sys/malloc.h> 40 #include <sys/bitops.h> 41 #include <sys/sensors.h> 42 43 #include <bus/pci/pcivar.h> 44 #include <bus/pci/pcireg.h> 45 #include <bus/pci/pci_cfgreg.h> 46 47 #include <vm/pmap.h> 48 49 #include "coremctl_if.h" 50 #include "pcib_if.h" 51 52 #include <dev/misc/dimm/dimm.h> 53 #include <dev/misc/coremctl/coremctl_reg.h> 54 55 #define ECC_E3_VER_1 1 /* Sandy Bridge */ 56 #define ECC_E3_VER_2 2 /* Ivy Bridge */ 57 #define ECC_E3_VER_3 3 /* Haswell */ 58 59 #define ECC_E3_THRESH_DEFAULT 5 60 61 #define ECC_E3_CHAN_MAX 2 62 #define ECC_E3_CHAN_DIMM_MAX 2 63 #define ECC_E3_DIMM_RANK_MAX 2 64 #define ECC_E3_CHAN_RANK_MAX (ECC_E3_CHAN_DIMM_MAX * ECC_E3_DIMM_RANK_MAX) 65 66 struct ecc_e3_type { 67 uint16_t did; 68 const char *desc; 69 int ver; /* ECC_E3_VER_ */ 70 }; 71 72 struct ecc_e3_dimm { 73 TAILQ_ENTRY(ecc_e3_dimm) dimm_link; 74 struct dimm_softc *dimm_softc; 75 struct ksensor dimm_sensor; 76 }; 77 78 struct ecc_e3_rank { 79 struct ecc_e3_dimm *rank_dimm_sc; 80 }; 81 82 struct ecc_e3_chan { 83 int chan_id; 84 int chan_errlog0; 85 int chan_rank_cnt; 86 struct ecc_e3_rank chan_rank[ECC_E3_CHAN_RANK_MAX]; 87 }; 88 89 struct ecc_e3_softc { 90 device_t ecc_dev; 91 device_t ecc_parent; /* non-NULL if parent has MCHBAR */ 92 int ecc_ver; /* ECC_E3_VER_ */ 93 uint32_t ecc_flags; /* ECC_E3_FLAG_ */ 94 95 struct ecc_e3_chan ecc_chan[ECC_E3_CHAN_MAX]; 96 TAILQ_HEAD(, ecc_e3_dimm) ecc_dimm; 97 98 /* 99 * If the parent does not have MCHBAR, 100 * i.e. no DIMM location information 101 * for the ECC errors, fallback to the 102 * sensor and counters below. 103 */ 104 struct ksensordev ecc_sensdev; 105 struct ksensor ecc_sens; 106 int ecc_count; 107 int ecc_thresh; 108 }; 109 110 #define ECC_E3_FLAG_SENSTASK 0x1 111 #define ECC_E3_FLAG_CRIT 0x2 112 113 #define ecc_printf(sc, fmt, arg...) \ 114 device_printf((sc)->ecc_dev, fmt , ##arg) 115 116 static int ecc_e3_probe(device_t); 117 static int ecc_e3_attach(device_t); 118 static int ecc_e3_detach(device_t); 119 static void ecc_e3_shutdown(device_t); 120 121 static void ecc_e3_attach_ch(struct ecc_e3_softc *, struct ecc_e3_chan *, 122 int, uint32_t, int); 123 static void ecc_e3_errlog(struct ecc_e3_softc *, boolean_t); 124 static void ecc_e3_errlog_ch(struct ecc_e3_softc *, struct ecc_e3_chan *, 125 boolean_t); 126 static void ecc_e3_stop(struct ecc_e3_softc *); 127 128 static void ecc_e3_sensor_task(void *); 129 static void ecc_e3_sensor_update(struct ecc_e3_softc *, boolean_t); 130 131 static const struct ecc_e3_type ecc_e3_types[] = { 132 { PCI_E3V1_MEMCTL_DID, "Intel E3 ECC", ECC_E3_VER_1 }, 133 { PCI_E3V2_MEMCTL_DID, "Intel E3 v2 ECC", ECC_E3_VER_2 }, 134 { PCI_E3V3_MEMCTL_DID, "Intel E3 v3 ECC", ECC_E3_VER_3 }, 135 { 0, NULL, 0 } /* required last entry */ 136 }; 137 138 static device_method_t ecc_e3_methods[] = { 139 /* Device interface */ 140 DEVMETHOD(device_probe, ecc_e3_probe), 141 DEVMETHOD(device_attach, ecc_e3_attach), 142 DEVMETHOD(device_detach, ecc_e3_detach), 143 DEVMETHOD(device_shutdown, ecc_e3_shutdown), 144 DEVMETHOD(device_suspend, bus_generic_suspend), 145 DEVMETHOD(device_resume, bus_generic_resume), 146 DEVMETHOD_END 147 }; 148 149 static driver_t ecc_e3_driver = { 150 "ecc", 151 ecc_e3_methods, 152 sizeof(struct ecc_e3_softc) 153 }; 154 static devclass_t ecc_devclass; 155 DRIVER_MODULE(ecc_e3, coremctl, ecc_e3_driver, ecc_devclass, NULL, NULL); 156 MODULE_DEPEND(ecc_e3, pci, 1, 1, 1); 157 MODULE_DEPEND(ecc_e3, coremctl, 1, 1, 1); 158 MODULE_VERSION(ecc_e3, 1); 159 160 static __inline uint32_t 161 CSR_READ_4(struct ecc_e3_softc *sc, int ofs) 162 { 163 uint32_t val; 164 int error; 165 166 error = COREMCTL_MCH_READ(sc->ecc_parent, ofs, &val); 167 KASSERT(!error, ("mch read failed")); 168 169 return val; 170 } 171 172 static int 173 ecc_e3_probe(device_t dev) 174 { 175 const struct ecc_e3_type *t; 176 uint16_t did; 177 178 if (pci_get_vendor(dev) != PCI_CORE_MEMCTL_VID) 179 return ENXIO; 180 181 did = pci_get_device(dev); 182 for (t = ecc_e3_types; t->desc != NULL; ++t) { 183 if (t->did == did) { 184 struct ecc_e3_softc *sc = device_get_softc(dev); 185 186 device_set_desc(dev, t->desc); 187 sc->ecc_ver = t->ver; 188 return 0; 189 } 190 } 191 return ENXIO; 192 } 193 194 static int 195 ecc_e3_attach(device_t dev) 196 { 197 struct ecc_e3_softc *sc = device_get_softc(dev); 198 uint32_t val; 199 int error; 200 201 TAILQ_INIT(&sc->ecc_dimm); 202 sc->ecc_dev = dev; 203 204 /* Probe the existance of MCHBAR */ 205 error = COREMCTL_MCH_READ(device_get_parent(dev), MCH_CORE_DIMM_CH0, 206 &val); 207 if (!error) 208 sc->ecc_parent = device_get_parent(dev); 209 210 if (sc->ecc_parent != NULL) { 211 uint32_t dimm_ch0, dimm_ch1; 212 int ecc_active; 213 214 if (bootverbose) { 215 ecc_printf(sc, "LOG0_C0 %#x\n", 216 CSR_READ_4(sc, MCH_E3_ERRLOG0_C0)); 217 ecc_printf(sc, "LOG0_C1 %#x\n", 218 CSR_READ_4(sc, MCH_E3_ERRLOG0_C1)); 219 } 220 221 dimm_ch0 = CSR_READ_4(sc, MCH_CORE_DIMM_CH0); 222 dimm_ch1 = CSR_READ_4(sc, MCH_CORE_DIMM_CH1); 223 224 ecc_e3_attach_ch(sc, &sc->ecc_chan[0], 0, dimm_ch0, 225 MCH_E3_ERRLOG0_C0); 226 ecc_e3_attach_ch(sc, &sc->ecc_chan[1], 1, dimm_ch1, 227 MCH_E3_ERRLOG0_C1); 228 229 ecc_active = 1; 230 if (sc->ecc_ver == ECC_E3_VER_1 || 231 sc->ecc_ver == ECC_E3_VER_2) { 232 if (((dimm_ch0 | dimm_ch1) & MCH_E3_DIMM_ECC) == 233 MCH_E3_DIMM_ECC_NONE) { 234 ecc_active = 0; 235 ecc_printf(sc, "No ECC active\n"); 236 } 237 } else { /* v3 */ 238 uint32_t ecc_mode0, ecc_mode1; 239 240 ecc_mode0 = __SHIFTOUT(dimm_ch0, MCH_E3_DIMM_ECC); 241 ecc_mode1 = __SHIFTOUT(dimm_ch1, MCH_E3_DIMM_ECC); 242 243 /* 244 * Only active ALL/NONE is supported 245 */ 246 247 if (ecc_mode0 != MCH_E3_DIMM_ECC_NONE && 248 ecc_mode0 != MCH_E3_DIMM_ECC_ALL) { 249 ecc_active = 0; 250 ecc_printf(sc, "channel0, invalid ECC " 251 "active 0x%x\n", ecc_mode0); 252 } 253 if (ecc_mode1 != MCH_E3_DIMM_ECC_NONE && 254 ecc_mode1 != MCH_E3_DIMM_ECC_ALL) { 255 ecc_active = 0; 256 ecc_printf(sc, "channel1, invalid ECC " 257 "active 0x%x\n", ecc_mode1); 258 } 259 260 if (ecc_mode0 == MCH_E3_DIMM_ECC_NONE && 261 ecc_mode1 == MCH_E3_DIMM_ECC_NONE) { 262 ecc_active = 0; 263 ecc_printf(sc, "No ECC active\n"); 264 } 265 } 266 267 if (!ecc_active) 268 return 0; 269 } else { 270 ecc_printf(sc, "MCHBAR is not enabled\n"); 271 272 /* 273 * Add hw.sensors.eccN.ecc0 MIB. 274 */ 275 strlcpy(sc->ecc_sensdev.xname, device_get_nameunit(dev), 276 sizeof(sc->ecc_sensdev.xname)); 277 strlcpy(sc->ecc_sens.desc, "node0 ecc", 278 sizeof(sc->ecc_sens.desc)); 279 sc->ecc_sens.type = SENSOR_ECC; 280 sensor_set(&sc->ecc_sens, 0, SENSOR_S_OK); 281 sensor_attach(&sc->ecc_sensdev, &sc->ecc_sens); 282 sensordev_install(&sc->ecc_sensdev); 283 284 sc->ecc_thresh = ECC_E3_THRESH_DEFAULT; 285 SYSCTL_ADD_INT(device_get_sysctl_ctx(dev), 286 SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), 287 OID_AUTO, "thresh", CTLFLAG_RW, &sc->ecc_thresh, 0, 288 "Raise alarm once number of ECC errors " 289 "goes above this value"); 290 } 291 292 sc->ecc_flags |= ECC_E3_FLAG_SENSTASK; 293 sensor_task_register(sc, ecc_e3_sensor_task, 1); 294 295 return 0; 296 } 297 298 static void 299 ecc_e3_sensor_task(void *xsc) 300 { 301 struct ecc_e3_softc *sc = xsc; 302 device_t dev = sc->ecc_dev; 303 uint16_t errsts; 304 305 errsts = pci_read_config(dev, PCI_E3_ERRSTS, 2); 306 if (errsts & (PCI_E3_ERRSTS_DSERR | PCI_E3_ERRSTS_DMERR)) { 307 boolean_t crit = FALSE; 308 309 if (errsts & PCI_E3_ERRSTS_DMERR) 310 crit = TRUE; 311 312 if (sc->ecc_parent != NULL) 313 ecc_e3_errlog(sc, crit); 314 else 315 ecc_e3_sensor_update(sc, crit); 316 317 /* Clear pending errors */ 318 pci_write_config(dev, PCI_E3_ERRSTS, errsts, 2); 319 } 320 } 321 322 static void 323 ecc_e3_attach_ch(struct ecc_e3_softc *sc, struct ecc_e3_chan *chan, 324 int chanid, uint32_t dimm_ch, int errlog0) 325 { 326 int dimm_size[ECC_E3_CHAN_DIMM_MAX]; 327 uint32_t dimm_szmask[ECC_E3_CHAN_DIMM_MAX]; 328 uint32_t dimm_dlrank[ECC_E3_CHAN_DIMM_MAX]; 329 int rank, dimm; 330 331 dimm_szmask[0] = MCH_CORE_DIMM_A_SIZE; 332 dimm_dlrank[0] = MCH_CORE_DIMM_A_DUAL_RANK; 333 dimm_szmask[1] = MCH_CORE_DIMM_B_SIZE; 334 dimm_dlrank[1] = MCH_CORE_DIMM_B_DUAL_RANK; 335 if (dimm_ch & MCH_CORE_DIMM_A_SELECT) { 336 dimm_szmask[0] = MCH_CORE_DIMM_B_SIZE; 337 dimm_dlrank[0] = MCH_CORE_DIMM_B_DUAL_RANK; 338 dimm_szmask[1] = MCH_CORE_DIMM_A_SIZE; 339 dimm_dlrank[1] = MCH_CORE_DIMM_A_DUAL_RANK; 340 } 341 342 dimm_size[0] = __SHIFTOUT(dimm_ch, dimm_szmask[0]); 343 dimm_size[1] = __SHIFTOUT(dimm_ch, dimm_szmask[1]); 344 if (dimm_size[0] == 0 && dimm_size[1] == 0) 345 return; 346 347 if (bootverbose) { 348 int ecc; 349 350 ecc = __SHIFTOUT(dimm_ch, MCH_E3_DIMM_ECC); 351 if (ecc == MCH_E3_DIMM_ECC_NONE) { 352 ecc_printf(sc, "channel%d, no ECC active\n", chanid); 353 } else if (ecc == MCH_E3_DIMM_ECC_ALL) { 354 ecc_printf(sc, "channel%d, ECC active IO/logic\n", 355 chanid); 356 } else { 357 if (sc->ecc_ver == ECC_E3_VER_1 || 358 sc->ecc_ver == ECC_E3_VER_2) { 359 if (ecc == MCH_E3_DIMM_ECC_IO) { 360 ecc_printf(sc, "channel%d, " 361 "ECC active IO\n", chanid); 362 } else { 363 ecc_printf(sc, "channel%d, " 364 "ECC active logic\n", chanid); 365 } 366 } else { /* v3 */ 367 ecc_printf(sc, "channel%d, " 368 "invalid ECC active 0x%x\n", chanid, ecc); 369 } 370 } 371 } 372 373 chan->chan_id = chanid; 374 chan->chan_errlog0 = errlog0; 375 376 rank = 0; 377 for (dimm = 0; dimm < ECC_E3_CHAN_DIMM_MAX; ++dimm) { 378 struct ecc_e3_dimm *dimm_sc; 379 struct ecc_e3_rank *rk; 380 struct ksensor *sens; 381 382 if (dimm_size[dimm] == 0) 383 continue; 384 385 dimm_sc = kmalloc(sizeof(*dimm_sc), M_DEVBUF, 386 M_WAITOK | M_ZERO); 387 dimm_sc->dimm_softc = dimm_create(0, chanid, dimm); 388 389 sens = &dimm_sc->dimm_sensor; 390 ksnprintf(sens->desc, sizeof(sens->desc), 391 "node0 chan%d DIMM%d ecc", chanid, dimm); 392 sens->type = SENSOR_ECC; 393 sensor_set(sens, 0, SENSOR_S_OK); 394 dimm_sensor_attach(dimm_sc->dimm_softc, sens); 395 396 TAILQ_INSERT_TAIL(&sc->ecc_dimm, dimm_sc, dimm_link); 397 398 KKASSERT(rank < ECC_E3_CHAN_RANK_MAX - 1); 399 rk = &chan->chan_rank[rank]; 400 rank++; 401 rk->rank_dimm_sc = dimm_sc; 402 if (dimm_ch & dimm_dlrank[dimm]) { 403 rk = &chan->chan_rank[rank]; 404 rank++; 405 rk->rank_dimm_sc = dimm_sc; 406 } 407 } 408 chan->chan_rank_cnt = rank; 409 } 410 411 static void 412 ecc_e3_errlog(struct ecc_e3_softc *sc, boolean_t crit) 413 { 414 int i; 415 416 for (i = 0; i < ECC_E3_CHAN_MAX; ++i) { 417 struct ecc_e3_chan *chan = &sc->ecc_chan[i]; 418 419 if (chan->chan_errlog0 != 0) 420 ecc_e3_errlog_ch(sc, chan, crit); 421 } 422 } 423 424 static void 425 ecc_e3_errlog_ch(struct ecc_e3_softc *sc, struct ecc_e3_chan *chan, 426 boolean_t crit) 427 { 428 uint32_t err0; 429 int rank; 430 431 err0 = CSR_READ_4(sc, chan->chan_errlog0); 432 if ((err0 & (MCH_E3_ERRLOG0_CERRSTS | MCH_E3_ERRLOG0_MERRSTS)) == 0) 433 return; 434 435 rank = __SHIFTOUT(err0, MCH_E3_ERRLOG0_ERRRANK); 436 if (rank >= chan->chan_rank_cnt) { 437 ecc_printf(sc, "channel%d rank%d %serror\n", chan->chan_id, 438 rank, crit ? "critical " : ""); 439 } else { 440 struct ecc_e3_dimm *dimm_sc; 441 442 dimm_sc = chan->chan_rank[rank].rank_dimm_sc; 443 dimm_sensor_ecc_add(dimm_sc->dimm_softc, &dimm_sc->dimm_sensor, 444 1, crit); 445 } 446 } 447 448 static int 449 ecc_e3_detach(device_t dev) 450 { 451 struct ecc_e3_softc *sc = device_get_softc(dev); 452 453 ecc_e3_stop(sc); 454 455 if (sc->ecc_parent != NULL) { 456 struct ecc_e3_dimm *dimm_sc; 457 458 while ((dimm_sc = TAILQ_FIRST(&sc->ecc_dimm)) != NULL) { 459 TAILQ_REMOVE(&sc->ecc_dimm, dimm_sc, dimm_link); 460 dimm_sensor_detach(dimm_sc->dimm_softc, 461 &dimm_sc->dimm_sensor); 462 dimm_destroy(dimm_sc->dimm_softc); 463 464 kfree(dimm_sc, M_DEVBUF); 465 } 466 } else { 467 sensordev_deinstall(&sc->ecc_sensdev); 468 } 469 return 0; 470 } 471 472 static void 473 ecc_e3_shutdown(device_t dev) 474 { 475 ecc_e3_stop(device_get_softc(dev)); 476 } 477 478 static void 479 ecc_e3_stop(struct ecc_e3_softc *sc) 480 { 481 if (sc->ecc_flags & ECC_E3_FLAG_SENSTASK) 482 sensor_task_unregister(sc); 483 } 484 485 static void 486 ecc_e3_sensor_update(struct ecc_e3_softc *sc, boolean_t crit) 487 { 488 enum sensor_status status; 489 490 sc->ecc_count++; 491 if (!crit && sc->ecc_count >= sc->ecc_thresh) 492 crit = TRUE; 493 494 if (crit && (sc->ecc_flags & ECC_E3_FLAG_CRIT) == 0) { 495 char ecc_str[16]; 496 497 ksnprintf(ecc_str, sizeof(ecc_str), "%d", sc->ecc_count); 498 devctl_notify("ecc", "ECC", ecc_str, "node=0"); 499 500 ecc_printf(sc, "too many ECC errors %d\n", sc->ecc_count); 501 sc->ecc_flags |= ECC_E3_FLAG_CRIT; 502 } 503 504 if (sc->ecc_flags & ECC_E3_FLAG_CRIT) 505 status = SENSOR_S_CRIT; 506 else 507 status = SENSOR_S_OK; 508 sensor_set(&sc->ecc_sens, sc->ecc_count, status); 509 } 510