1 /* 2 * Copyright (c) 2011 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Sepherosa Ziehau <sepherosa@gmail.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/bus.h> 38 #include <sys/kernel.h> 39 #include <sys/malloc.h> 40 #include <sys/bitops.h> 41 #include <sys/sensors.h> 42 43 #include <bus/pci/pcivar.h> 44 #include <bus/pci/pcireg.h> 45 #include <bus/pci/pcibus.h> 46 #include <bus/pci/pci_cfgreg.h> 47 48 #include <vm/pmap.h> 49 50 #include "coremctl_if.h" 51 #include "pcib_if.h" 52 53 #include <dev/misc/dimm/dimm.h> 54 #include <dev/misc/coremctl/coremctl_reg.h> 55 56 #define ECC_E3_VER_1 1 /* Sandy Bridge */ 57 #define ECC_E3_VER_2 2 /* Ivy Bridge */ 58 #define ECC_E3_VER_3 3 /* Haswell */ 59 60 #define ECC_E3_THRESH_DEFAULT 5 61 62 #define ECC_E3_CHAN_MAX 2 63 #define ECC_E3_CHAN_DIMM_MAX 2 64 #define ECC_E3_DIMM_RANK_MAX 2 65 #define ECC_E3_CHAN_RANK_MAX (ECC_E3_CHAN_DIMM_MAX * ECC_E3_DIMM_RANK_MAX) 66 67 struct ecc_e3_type { 68 uint16_t did; 69 const char *desc; 70 int ver; /* ECC_E3_VER_ */ 71 }; 72 73 struct ecc_e3_dimm { 74 TAILQ_ENTRY(ecc_e3_dimm) dimm_link; 75 struct dimm_softc *dimm_softc; 76 struct ksensor dimm_sensor; 77 }; 78 79 struct ecc_e3_rank { 80 struct ecc_e3_dimm *rank_dimm_sc; 81 }; 82 83 struct ecc_e3_chan { 84 int chan_id; 85 int chan_errlog0; 86 int chan_rank_cnt; 87 struct ecc_e3_rank chan_rank[ECC_E3_CHAN_RANK_MAX]; 88 }; 89 90 struct ecc_e3_softc { 91 device_t ecc_dev; 92 device_t ecc_parent; /* non-NULL if parent has MCHBAR */ 93 int ecc_ver; /* ECC_E3_VER_ */ 94 uint32_t ecc_flags; /* ECC_E3_FLAG_ */ 95 96 struct ecc_e3_chan ecc_chan[ECC_E3_CHAN_MAX]; 97 TAILQ_HEAD(, ecc_e3_dimm) ecc_dimm; 98 99 /* 100 * If the parent does not have MCHBAR, 101 * i.e. no DIMM location information 102 * for the ECC errors, fallback to the 103 * sensor and counters below. 104 */ 105 struct ksensordev ecc_sensdev; 106 struct ksensor ecc_sens; 107 int ecc_count; 108 int ecc_thresh; 109 }; 110 111 #define ECC_E3_FLAG_SENSTASK 0x1 112 #define ECC_E3_FLAG_CRIT 0x2 113 114 #define ecc_printf(sc, fmt, arg...) \ 115 device_printf((sc)->ecc_dev, fmt , ##arg) 116 117 static int ecc_e3_probe(device_t); 118 static int ecc_e3_attach(device_t); 119 static int ecc_e3_detach(device_t); 120 static void ecc_e3_shutdown(device_t); 121 122 static void ecc_e3_attach_ch(struct ecc_e3_softc *, struct ecc_e3_chan *, 123 int, uint32_t, int); 124 static void ecc_e3_errlog(struct ecc_e3_softc *, boolean_t); 125 static void ecc_e3_errlog_ch(struct ecc_e3_softc *, struct ecc_e3_chan *, 126 boolean_t); 127 static void ecc_e3_stop(struct ecc_e3_softc *); 128 129 static void ecc_e3_sensor_task(void *); 130 static void ecc_e3_sensor_update(struct ecc_e3_softc *, boolean_t); 131 132 static const struct ecc_e3_type ecc_e3_types[] = { 133 { PCI_E3V1_MEMCTL_DID, "Intel E3 ECC", ECC_E3_VER_1 }, 134 { PCI_E3V2_MEMCTL_DID, "Intel E3 v2 ECC", ECC_E3_VER_2 }, 135 { PCI_E3V3_MEMCTL_DID, "Intel E3 v3 ECC", ECC_E3_VER_3 }, 136 { 0, NULL, 0 } /* required last entry */ 137 }; 138 139 static device_method_t ecc_e3_methods[] = { 140 /* Device interface */ 141 DEVMETHOD(device_probe, ecc_e3_probe), 142 DEVMETHOD(device_attach, ecc_e3_attach), 143 DEVMETHOD(device_detach, ecc_e3_detach), 144 DEVMETHOD(device_shutdown, ecc_e3_shutdown), 145 DEVMETHOD(device_suspend, bus_generic_suspend), 146 DEVMETHOD(device_resume, bus_generic_resume), 147 DEVMETHOD_END 148 }; 149 150 static driver_t ecc_e3_driver = { 151 "ecc", 152 ecc_e3_methods, 153 sizeof(struct ecc_e3_softc) 154 }; 155 static devclass_t ecc_devclass; 156 DRIVER_MODULE(ecc_e3, coremctl, ecc_e3_driver, ecc_devclass, NULL, NULL); 157 MODULE_DEPEND(ecc_e3, pci, 1, 1, 1); 158 MODULE_DEPEND(ecc_e3, coremctl, 1, 1, 1); 159 160 static __inline uint32_t 161 CSR_READ_4(struct ecc_e3_softc *sc, int ofs) 162 { 163 uint32_t val; 164 int error; 165 166 error = COREMCTL_MCH_READ(sc->ecc_parent, ofs, &val); 167 KASSERT(!error, ("mch read failed")); 168 169 return val; 170 } 171 172 static int 173 ecc_e3_probe(device_t dev) 174 { 175 const struct ecc_e3_type *t; 176 uint16_t did; 177 178 if (pci_get_vendor(dev) != PCI_CORE_MEMCTL_VID) 179 return ENXIO; 180 181 did = pci_get_device(dev); 182 for (t = ecc_e3_types; t->desc != NULL; ++t) { 183 if (t->did == did) { 184 struct ecc_e3_softc *sc = device_get_softc(dev); 185 186 device_set_desc(dev, t->desc); 187 sc->ecc_ver = t->ver; 188 return 0; 189 } 190 } 191 return ENXIO; 192 } 193 194 static int 195 ecc_e3_attach(device_t dev) 196 { 197 struct ecc_e3_softc *sc = device_get_softc(dev); 198 uint32_t val; 199 int error; 200 201 TAILQ_INIT(&sc->ecc_dimm); 202 sc->ecc_dev = dev; 203 204 /* Probe the existance of MCHBAR */ 205 error = COREMCTL_MCH_READ(device_get_parent(dev), MCH_CORE_DIMM_CH0, 206 &val); 207 if (!error) 208 sc->ecc_parent = device_get_parent(dev); 209 210 if (sc->ecc_parent != NULL) { 211 uint32_t dimm_ch0, dimm_ch1; 212 int ecc_active; 213 214 if (bootverbose) { 215 ecc_printf(sc, "LOG0_C0 %#x\n", 216 CSR_READ_4(sc, MCH_E3_ERRLOG0_C0)); 217 ecc_printf(sc, "LOG0_C1 %#x\n", 218 CSR_READ_4(sc, MCH_E3_ERRLOG0_C1)); 219 } 220 221 dimm_ch0 = CSR_READ_4(sc, MCH_CORE_DIMM_CH0); 222 dimm_ch1 = CSR_READ_4(sc, MCH_CORE_DIMM_CH1); 223 224 ecc_e3_attach_ch(sc, &sc->ecc_chan[0], 0, dimm_ch0, 225 MCH_E3_ERRLOG0_C0); 226 ecc_e3_attach_ch(sc, &sc->ecc_chan[1], 1, dimm_ch1, 227 MCH_E3_ERRLOG0_C1); 228 229 ecc_active = 1; 230 if (sc->ecc_ver == ECC_E3_VER_1 || 231 sc->ecc_ver == ECC_E3_VER_2) { 232 if (((dimm_ch0 | dimm_ch1) & MCH_E3_DIMM_ECC) == 233 MCH_E3_DIMM_ECC_NONE) { 234 ecc_active = 0; 235 ecc_printf(sc, "No ECC active\n"); 236 } 237 } else { /* v3 */ 238 uint32_t ecc_mode0, ecc_mode1; 239 240 ecc_mode0 = __SHIFTOUT(dimm_ch0, MCH_E3_DIMM_ECC); 241 ecc_mode1 = __SHIFTOUT(dimm_ch1, MCH_E3_DIMM_ECC); 242 243 /* 244 * Only active ALL/NONE is supported 245 */ 246 247 if (ecc_mode0 != MCH_E3_DIMM_ECC_NONE && 248 ecc_mode0 != MCH_E3_DIMM_ECC_ALL) { 249 ecc_active = 0; 250 ecc_printf(sc, "channel0, invalid ECC " 251 "active 0x%x\n", ecc_mode0); 252 } 253 if (ecc_mode1 != MCH_E3_DIMM_ECC_NONE && 254 ecc_mode1 != MCH_E3_DIMM_ECC_ALL) { 255 ecc_active = 0; 256 ecc_printf(sc, "channel1, invalid ECC " 257 "active 0x%x\n", ecc_mode1); 258 } 259 260 if (ecc_mode0 == MCH_E3_DIMM_ECC_NONE && 261 ecc_mode1 == MCH_E3_DIMM_ECC_NONE) { 262 ecc_active = 0; 263 ecc_printf(sc, "No ECC active\n"); 264 } 265 } 266 267 if (!ecc_active) 268 return 0; 269 } else { 270 ecc_printf(sc, "MCHBAR is not enabled\n"); 271 272 /* 273 * Add hw.sensors.eccN.ecc0 MIB. 274 */ 275 strlcpy(sc->ecc_sensdev.xname, device_get_nameunit(dev), 276 sizeof(sc->ecc_sensdev.xname)); 277 strlcpy(sc->ecc_sens.desc, "node0 ecc", 278 sizeof(sc->ecc_sens.desc)); 279 sc->ecc_sens.type = SENSOR_ECC; 280 sensor_set(&sc->ecc_sens, 0, SENSOR_S_OK); 281 sensor_attach(&sc->ecc_sensdev, &sc->ecc_sens); 282 sensordev_install(&sc->ecc_sensdev); 283 284 sc->ecc_thresh = ECC_E3_THRESH_DEFAULT; 285 SYSCTL_ADD_INT(device_get_sysctl_ctx(dev), 286 SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), 287 OID_AUTO, "thresh", CTLFLAG_RW, &sc->ecc_thresh, 0, 288 "Raise alarm once number of ECC errors " 289 "goes above this value"); 290 } 291 292 sc->ecc_flags |= ECC_E3_FLAG_SENSTASK; 293 sensor_task_register(sc, ecc_e3_sensor_task, 1); 294 295 return 0; 296 } 297 298 static void 299 ecc_e3_sensor_task(void *xsc) 300 { 301 struct ecc_e3_softc *sc = xsc; 302 device_t dev = sc->ecc_dev; 303 uint16_t errsts; 304 305 errsts = pci_read_config(dev, PCI_E3_ERRSTS, 2); 306 if (errsts & (PCI_E3_ERRSTS_DSERR | PCI_E3_ERRSTS_DMERR)) { 307 boolean_t crit = FALSE; 308 309 if (errsts & PCI_E3_ERRSTS_DMERR) 310 crit = TRUE; 311 312 if (sc->ecc_parent != NULL) 313 ecc_e3_errlog(sc, crit); 314 else 315 ecc_e3_sensor_update(sc, crit); 316 317 /* Clear pending errors */ 318 pci_write_config(dev, PCI_E3_ERRSTS, errsts, 2); 319 } 320 } 321 322 static void 323 ecc_e3_attach_ch(struct ecc_e3_softc *sc, struct ecc_e3_chan *chan, 324 int chanid, uint32_t dimm_ch, int errlog0) 325 { 326 int dimm_size[ECC_E3_CHAN_DIMM_MAX]; 327 uint32_t dimm_szmask[ECC_E3_CHAN_DIMM_MAX]; 328 uint32_t dimm_dlrank[ECC_E3_CHAN_DIMM_MAX]; 329 int rank, dimm; 330 331 dimm_szmask[0] = MCH_CORE_DIMM_A_SIZE; 332 dimm_dlrank[0] = MCH_CORE_DIMM_A_DUAL_RANK; 333 dimm_szmask[1] = MCH_CORE_DIMM_B_SIZE; 334 dimm_dlrank[1] = MCH_CORE_DIMM_B_DUAL_RANK; 335 if (dimm_ch & MCH_CORE_DIMM_A_SELECT) { 336 dimm_szmask[0] = MCH_CORE_DIMM_B_SIZE; 337 dimm_dlrank[0] = MCH_CORE_DIMM_B_DUAL_RANK; 338 dimm_szmask[1] = MCH_CORE_DIMM_A_SIZE; 339 dimm_dlrank[1] = MCH_CORE_DIMM_A_DUAL_RANK; 340 } 341 342 dimm_size[0] = __SHIFTOUT(dimm_ch, dimm_szmask[0]); 343 dimm_size[1] = __SHIFTOUT(dimm_ch, dimm_szmask[1]); 344 if (dimm_size[0] == 0 && dimm_size[1] == 0) 345 return; 346 347 if (bootverbose) { 348 int ecc; 349 350 ecc = __SHIFTOUT(dimm_ch, MCH_E3_DIMM_ECC); 351 if (ecc == MCH_E3_DIMM_ECC_NONE) { 352 ecc_printf(sc, "channel%d, no ECC active\n", chanid); 353 } else if (ecc == MCH_E3_DIMM_ECC_ALL) { 354 ecc_printf(sc, "channel%d, ECC active IO/logic\n", 355 chanid); 356 } else { 357 if (sc->ecc_ver == ECC_E3_VER_1 || 358 sc->ecc_ver == ECC_E3_VER_2) { 359 if (ecc == MCH_E3_DIMM_ECC_IO) { 360 ecc_printf(sc, "channel%d, " 361 "ECC active IO\n", chanid); 362 } else { 363 ecc_printf(sc, "channel%d, " 364 "ECC active logic\n", chanid); 365 } 366 } else { /* v3 */ 367 ecc_printf(sc, "channel%d, " 368 "invalid ECC active 0x%x\n", chanid, ecc); 369 } 370 } 371 } 372 373 chan->chan_id = chanid; 374 chan->chan_errlog0 = errlog0; 375 376 rank = 0; 377 for (dimm = 0; dimm < ECC_E3_CHAN_DIMM_MAX; ++dimm) { 378 struct ecc_e3_dimm *dimm_sc; 379 struct ecc_e3_rank *rk; 380 struct ksensor *sens; 381 382 if (dimm_size[dimm] == 0) 383 continue; 384 385 dimm_sc = kmalloc(sizeof(*dimm_sc), M_DEVBUF, 386 M_WAITOK | M_ZERO); 387 dimm_sc->dimm_softc = dimm_create(0, chanid, dimm); 388 389 sens = &dimm_sc->dimm_sensor; 390 ksnprintf(sens->desc, sizeof(sens->desc), 391 "node0 chan%d DIMM%d ecc", chanid, dimm); 392 sens->type = SENSOR_ECC; 393 sensor_set(sens, 0, SENSOR_S_OK); 394 dimm_sensor_attach(dimm_sc->dimm_softc, sens); 395 396 TAILQ_INSERT_TAIL(&sc->ecc_dimm, dimm_sc, dimm_link); 397 398 KKASSERT(rank < ECC_E3_CHAN_RANK_MAX - 1); 399 rk = &chan->chan_rank[rank]; 400 rank++; 401 rk->rank_dimm_sc = dimm_sc; 402 if (dimm_ch & dimm_dlrank[dimm]) { 403 rk = &chan->chan_rank[rank]; 404 rank++; 405 rk->rank_dimm_sc = dimm_sc; 406 } 407 } 408 chan->chan_rank_cnt = rank; 409 } 410 411 static void 412 ecc_e3_errlog(struct ecc_e3_softc *sc, boolean_t crit) 413 { 414 int i; 415 416 for (i = 0; i < ECC_E3_CHAN_MAX; ++i) { 417 struct ecc_e3_chan *chan = &sc->ecc_chan[i]; 418 419 if (chan->chan_errlog0 != 0) 420 ecc_e3_errlog_ch(sc, chan, crit); 421 } 422 } 423 424 static void 425 ecc_e3_errlog_ch(struct ecc_e3_softc *sc, struct ecc_e3_chan *chan, 426 boolean_t crit) 427 { 428 uint32_t err0; 429 int rank; 430 431 err0 = CSR_READ_4(sc, chan->chan_errlog0); 432 if ((err0 & (MCH_E3_ERRLOG0_CERRSTS | MCH_E3_ERRLOG0_MERRSTS)) == 0) 433 return; 434 435 rank = __SHIFTOUT(err0, MCH_E3_ERRLOG0_ERRRANK); 436 if (rank >= chan->chan_rank_cnt) { 437 ecc_printf(sc, "channel%d rank%d %serror\n", chan->chan_id, 438 rank, crit ? "critical " : ""); 439 } else { 440 struct ecc_e3_dimm *dimm_sc; 441 442 dimm_sc = chan->chan_rank[rank].rank_dimm_sc; 443 dimm_sensor_ecc_add(dimm_sc->dimm_softc, &dimm_sc->dimm_sensor, 444 1, crit); 445 } 446 } 447 448 static int 449 ecc_e3_detach(device_t dev) 450 { 451 struct ecc_e3_softc *sc = device_get_softc(dev); 452 453 ecc_e3_stop(sc); 454 455 if (sc->ecc_parent != NULL) { 456 struct ecc_e3_dimm *dimm_sc; 457 458 while ((dimm_sc = TAILQ_FIRST(&sc->ecc_dimm)) != NULL) { 459 TAILQ_REMOVE(&sc->ecc_dimm, dimm_sc, dimm_link); 460 dimm_sensor_detach(dimm_sc->dimm_softc, 461 &dimm_sc->dimm_sensor); 462 dimm_destroy(dimm_sc->dimm_softc); 463 464 kfree(dimm_sc, M_DEVBUF); 465 } 466 } else { 467 sensordev_deinstall(&sc->ecc_sensdev); 468 } 469 return 0; 470 } 471 472 static void 473 ecc_e3_shutdown(device_t dev) 474 { 475 ecc_e3_stop(device_get_softc(dev)); 476 } 477 478 static void 479 ecc_e3_stop(struct ecc_e3_softc *sc) 480 { 481 if (sc->ecc_flags & ECC_E3_FLAG_SENSTASK) 482 sensor_task_unregister(sc); 483 } 484 485 static void 486 ecc_e3_sensor_update(struct ecc_e3_softc *sc, boolean_t crit) 487 { 488 enum sensor_status status; 489 490 sc->ecc_count++; 491 if (!crit && sc->ecc_count >= sc->ecc_thresh) 492 crit = TRUE; 493 494 if (crit && (sc->ecc_flags & ECC_E3_FLAG_CRIT) == 0) { 495 char ecc_str[16]; 496 497 ksnprintf(ecc_str, sizeof(ecc_str), "%d", sc->ecc_count); 498 devctl_notify("ecc", "ECC", ecc_str, "node=0"); 499 500 ecc_printf(sc, "too many ECC errors %d\n", sc->ecc_count); 501 sc->ecc_flags |= ECC_E3_FLAG_CRIT; 502 } 503 504 if (sc->ecc_flags & ECC_E3_FLAG_CRIT) 505 status = SENSOR_S_CRIT; 506 else 507 status = SENSOR_S_OK; 508 sensor_set(&sc->ecc_sens, sc->ecc_count, status); 509 } 510