1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2009 Hudson River Trading LLC 5 * Written by: John H. Baldwin <jhb@FreeBSD.org> 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 /* 31 * Support for x86 machine check architecture. 
32 */ 33 34 #include <sys/cdefs.h> 35 __FBSDID("$FreeBSD$"); 36 37 #ifdef __amd64__ 38 #define DEV_APIC 39 #else 40 #include "opt_apic.h" 41 #endif 42 43 #include <sys/param.h> 44 #include <sys/bus.h> 45 #include <sys/interrupt.h> 46 #include <sys/kernel.h> 47 #include <sys/lock.h> 48 #include <sys/malloc.h> 49 #include <sys/mutex.h> 50 #include <sys/proc.h> 51 #include <sys/sched.h> 52 #include <sys/smp.h> 53 #include <sys/sysctl.h> 54 #include <sys/systm.h> 55 #include <sys/taskqueue.h> 56 #include <machine/intr_machdep.h> 57 #include <x86/apicvar.h> 58 #include <machine/cpu.h> 59 #include <machine/cputypes.h> 60 #include <x86/mca.h> 61 #include <machine/md_var.h> 62 #include <machine/specialreg.h> 63 64 /* Modes for mca_scan() */ 65 enum scan_mode { 66 POLLED, 67 MCE, 68 CMCI, 69 }; 70 71 #ifdef DEV_APIC 72 /* 73 * State maintained for each monitored MCx bank to control the 74 * corrected machine check interrupt threshold. 75 */ 76 struct cmc_state { 77 int max_threshold; 78 time_t last_intr; 79 }; 80 81 struct amd_et_state { 82 int cur_threshold; 83 time_t last_intr; 84 }; 85 #endif 86 87 struct mca_internal { 88 struct mca_record rec; 89 STAILQ_ENTRY(mca_internal) link; 90 }; 91 92 struct mca_enumerator_ops { 93 unsigned int (*ctl)(int); 94 unsigned int (*status)(int); 95 unsigned int (*addr)(int); 96 unsigned int (*misc)(int); 97 }; 98 99 static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture"); 100 101 static volatile int mca_count; /* Number of records stored. */ 102 static int mca_banks; /* Number of per-CPU register banks. */ 103 static int mca_maxcount = -1; /* Limit on records stored. 
(-1 = unlimited) */ 104 105 static SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 106 "Machine Check Architecture"); 107 108 static int mca_enabled = 1; 109 SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0, 110 "Administrative toggle for machine check support"); 111 112 static int log_corrected = 1; 113 SYSCTL_INT(_hw_mca, OID_AUTO, log_corrected, CTLFLAG_RWTUN, &log_corrected, 0, 114 "Log corrected errors to the console"); 115 116 static int amd10h_L1TP = 1; 117 SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0, 118 "Administrative toggle for logging of level one TLB parity (L1TP) errors"); 119 120 static int intel6h_HSD131; 121 SYSCTL_INT(_hw_mca, OID_AUTO, intel6h_HSD131, CTLFLAG_RDTUN, &intel6h_HSD131, 0, 122 "Administrative toggle for logging of spurious corrected errors"); 123 124 int workaround_erratum383; 125 SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTUN, 126 &workaround_erratum383, 0, 127 "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?"); 128 129 static STAILQ_HEAD(, mca_internal) mca_freelist; 130 static int mca_freecount; 131 static STAILQ_HEAD(, mca_internal) mca_records; 132 static STAILQ_HEAD(, mca_internal) mca_pending; 133 static int mca_ticks = 300; 134 static struct taskqueue *mca_tq; 135 static struct task mca_resize_task; 136 static struct timeout_task mca_scan_task; 137 static struct mtx mca_lock; 138 139 static unsigned int 140 mca_ia32_ctl_reg(int bank) 141 { 142 return (MSR_MC_CTL(bank)); 143 } 144 145 static unsigned int 146 mca_ia32_status_reg(int bank) 147 { 148 return (MSR_MC_STATUS(bank)); 149 } 150 151 static unsigned int 152 mca_ia32_addr_reg(int bank) 153 { 154 return (MSR_MC_ADDR(bank)); 155 } 156 157 static unsigned int 158 mca_ia32_misc_reg(int bank) 159 { 160 return (MSR_MC_MISC(bank)); 161 } 162 163 static unsigned int 164 mca_smca_ctl_reg(int bank) 165 { 166 return (MSR_SMCA_MC_CTL(bank)); 167 } 168 169 static unsigned int 170 
mca_smca_status_reg(int bank) 171 { 172 return (MSR_SMCA_MC_STATUS(bank)); 173 } 174 175 static unsigned int 176 mca_smca_addr_reg(int bank) 177 { 178 return (MSR_SMCA_MC_ADDR(bank)); 179 } 180 181 static unsigned int 182 mca_smca_misc_reg(int bank) 183 { 184 return (MSR_SMCA_MC_MISC(bank)); 185 } 186 187 static struct mca_enumerator_ops mca_msr_ops = { 188 .ctl = mca_ia32_ctl_reg, 189 .status = mca_ia32_status_reg, 190 .addr = mca_ia32_addr_reg, 191 .misc = mca_ia32_misc_reg 192 }; 193 194 #ifdef DEV_APIC 195 static struct cmc_state **cmc_state; /* Indexed by cpuid, bank. */ 196 static struct amd_et_state **amd_et_state; /* Indexed by cpuid, bank. */ 197 static int cmc_throttle = 60; /* Time in seconds to throttle CMCI. */ 198 199 static int amd_elvt = -1; 200 201 static inline bool 202 amd_thresholding_supported(void) 203 { 204 if (cpu_vendor_id != CPU_VENDOR_AMD && 205 cpu_vendor_id != CPU_VENDOR_HYGON) 206 return (false); 207 /* 208 * The RASCap register is wholly reserved in families 0x10-0x15 (through model 1F). 209 * 210 * It begins to be documented in family 0x15 model 30 and family 0x16, 211 * but neither of these families documents the ScalableMca bit, which 212 * supposedly defines the presence of this feature on family 0x17. 213 */ 214 if (CPUID_TO_FAMILY(cpu_id) >= 0x10 && CPUID_TO_FAMILY(cpu_id) <= 0x16) 215 return (true); 216 if (CPUID_TO_FAMILY(cpu_id) >= 0x17) 217 return ((amd_rascap & AMDRAS_SCALABLE_MCA) != 0); 218 return (false); 219 } 220 #endif 221 222 static inline bool 223 cmci_supported(uint64_t mcg_cap) 224 { 225 /* 226 * MCG_CAP_CMCI_P bit is reserved in AMD documentation. Until 227 * it is defined, do not use it to check for CMCI support. 228 */ 229 if (cpu_vendor_id != CPU_VENDOR_INTEL) 230 return (false); 231 return ((mcg_cap & MCG_CAP_CMCI_P) != 0); 232 } 233 234 static inline bool 235 tes_supported(uint64_t mcg_cap) 236 { 237 238 /* 239 * MCG_CAP_TES_P bit is reserved in AMD documentation. 
Until 240 * it is defined, do not use it to check for TES support. 241 */ 242 if (cpu_vendor_id != CPU_VENDOR_INTEL) 243 return (false); 244 return ((mcg_cap & MCG_CAP_TES_P) != 0); 245 } 246 247 static inline bool 248 ser_supported(uint64_t mcg_cap) 249 { 250 251 return (tes_supported(mcg_cap) && (mcg_cap & MCG_CAP_SER_P) != 0); 252 } 253 254 static int 255 sysctl_positive_int(SYSCTL_HANDLER_ARGS) 256 { 257 int error, value; 258 259 value = *(int *)arg1; 260 error = sysctl_handle_int(oidp, &value, 0, req); 261 if (error || req->newptr == NULL) 262 return (error); 263 if (value <= 0) 264 return (EINVAL); 265 *(int *)arg1 = value; 266 return (0); 267 } 268 269 static int 270 sysctl_mca_records(SYSCTL_HANDLER_ARGS) 271 { 272 int *name = (int *)arg1; 273 u_int namelen = arg2; 274 struct mca_record record; 275 struct mca_internal *rec; 276 int i; 277 278 if (namelen != 1) 279 return (EINVAL); 280 281 if (name[0] < 0 || name[0] >= mca_count) 282 return (EINVAL); 283 284 mtx_lock_spin(&mca_lock); 285 if (name[0] >= mca_count) { 286 mtx_unlock_spin(&mca_lock); 287 return (EINVAL); 288 } 289 i = 0; 290 STAILQ_FOREACH(rec, &mca_records, link) { 291 if (i == name[0]) { 292 record = rec->rec; 293 break; 294 } 295 i++; 296 } 297 mtx_unlock_spin(&mca_lock); 298 return (SYSCTL_OUT(req, &record, sizeof(record))); 299 } 300 301 static const char * 302 mca_error_ttype(uint16_t mca_error) 303 { 304 305 switch ((mca_error & 0x000c) >> 2) { 306 case 0: 307 return ("I"); 308 case 1: 309 return ("D"); 310 case 2: 311 return ("G"); 312 } 313 return ("?"); 314 } 315 316 static const char * 317 mca_error_level(uint16_t mca_error) 318 { 319 320 switch (mca_error & 0x0003) { 321 case 0: 322 return ("L0"); 323 case 1: 324 return ("L1"); 325 case 2: 326 return ("L2"); 327 case 3: 328 return ("LG"); 329 } 330 return ("L?"); 331 } 332 333 static const char * 334 mca_error_request(uint16_t mca_error) 335 { 336 337 switch ((mca_error & 0x00f0) >> 4) { 338 case 0x0: 339 return ("ERR"); 340 case 
0x1: 341 return ("RD"); 342 case 0x2: 343 return ("WR"); 344 case 0x3: 345 return ("DRD"); 346 case 0x4: 347 return ("DWR"); 348 case 0x5: 349 return ("IRD"); 350 case 0x6: 351 return ("PREFETCH"); 352 case 0x7: 353 return ("EVICT"); 354 case 0x8: 355 return ("SNOOP"); 356 } 357 return ("???"); 358 } 359 360 static const char * 361 mca_error_mmtype(uint16_t mca_error) 362 { 363 364 switch ((mca_error & 0x70) >> 4) { 365 case 0x0: 366 return ("GEN"); 367 case 0x1: 368 return ("RD"); 369 case 0x2: 370 return ("WR"); 371 case 0x3: 372 return ("AC"); 373 case 0x4: 374 return ("MS"); 375 } 376 return ("???"); 377 } 378 379 static const char * 380 mca_addres_mode(uint64_t mca_misc) 381 { 382 383 switch ((mca_misc & MC_MISC_ADDRESS_MODE) >> 6) { 384 case 0x0: 385 return ("Segment Offset"); 386 case 0x1: 387 return ("Linear Address"); 388 case 0x2: 389 return ("Physical Address"); 390 case 0x3: 391 return ("Memory Address"); 392 case 0x7: 393 return ("Generic"); 394 } 395 return ("???"); 396 } 397 398 static int 399 mca_mute(const struct mca_record *rec) 400 { 401 402 /* 403 * Skip spurious corrected parity errors generated by Intel Haswell- 404 * and Broadwell-based CPUs (see HSD131, HSM142, HSW131 and BDM48 405 * erratum respectively), unless reporting is enabled. 406 * Note that these errors also have been observed with the D0-stepping 407 * of Haswell, while at least initially the CPU specification updates 408 * suggested only the C0-stepping to be affected. Similarly, Celeron 409 * 2955U with a CPU ID of 0x45 apparently are also concerned with the 410 * same problem, with HSM142 only referring to 0x3c and 0x46. 
411 */ 412 if (cpu_vendor_id == CPU_VENDOR_INTEL && 413 CPUID_TO_FAMILY(cpu_id) == 0x6 && 414 (CPUID_TO_MODEL(cpu_id) == 0x3c || /* HSD131, HSM142, HSW131 */ 415 CPUID_TO_MODEL(cpu_id) == 0x3d || /* BDM48 */ 416 CPUID_TO_MODEL(cpu_id) == 0x45 || 417 CPUID_TO_MODEL(cpu_id) == 0x46) && /* HSM142 */ 418 rec->mr_bank == 0 && 419 (rec->mr_status & 0xa0000000ffffffff) == 0x80000000000f0005 && 420 !intel6h_HSD131) 421 return (1); 422 423 return (0); 424 } 425 426 /* Dump details about a single machine check. */ 427 static void 428 mca_log(const struct mca_record *rec) 429 { 430 uint16_t mca_error; 431 432 if (mca_mute(rec)) 433 return; 434 435 if (!log_corrected && (rec->mr_status & MC_STATUS_UC) == 0 && 436 (!tes_supported(rec->mr_mcg_cap) || 437 ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) != 0x2)) 438 return; 439 440 printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank, 441 (long long)rec->mr_status); 442 printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n", 443 (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status); 444 printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor, 445 rec->mr_cpu_id, rec->mr_apic_id); 446 printf("MCA: CPU %d ", rec->mr_cpu); 447 if (rec->mr_status & MC_STATUS_UC) 448 printf("UNCOR "); 449 else { 450 printf("COR "); 451 if (cmci_supported(rec->mr_mcg_cap)) 452 printf("(%lld) ", ((long long)rec->mr_status & 453 MC_STATUS_COR_COUNT) >> 38); 454 if (tes_supported(rec->mr_mcg_cap)) { 455 switch ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) { 456 case 0x1: 457 printf("(Green) "); 458 case 0x2: 459 printf("(Yellow) "); 460 } 461 } 462 } 463 if (rec->mr_status & MC_STATUS_EN) 464 printf("EN "); 465 if (rec->mr_status & MC_STATUS_PCC) 466 printf("PCC "); 467 if (ser_supported(rec->mr_mcg_cap)) { 468 if (rec->mr_status & MC_STATUS_S) 469 printf("S "); 470 if (rec->mr_status & MC_STATUS_AR) 471 printf("AR "); 472 } 473 if (rec->mr_status & MC_STATUS_OVER) 474 printf("OVER "); 475 mca_error = rec->mr_status & MC_STATUS_MCA_ERROR; 
476 switch (mca_error) { 477 /* Simple error codes. */ 478 case 0x0000: 479 printf("no error"); 480 break; 481 case 0x0001: 482 printf("unclassified error"); 483 break; 484 case 0x0002: 485 printf("ucode ROM parity error"); 486 break; 487 case 0x0003: 488 printf("external error"); 489 break; 490 case 0x0004: 491 printf("FRC error"); 492 break; 493 case 0x0005: 494 printf("internal parity error"); 495 break; 496 case 0x0006: 497 printf("SMM handler code access violation"); 498 break; 499 case 0x0400: 500 printf("internal timer error"); 501 break; 502 case 0x0e0b: 503 printf("generic I/O error"); 504 if (rec->mr_cpu_vendor_id == CPU_VENDOR_INTEL && 505 (rec->mr_status & MC_STATUS_MISCV)) { 506 printf(" (pci%d:%d:%d:%d)", 507 (int)((rec->mr_misc & MC_MISC_PCIE_SEG) >> 32), 508 (int)((rec->mr_misc & MC_MISC_PCIE_BUS) >> 24), 509 (int)((rec->mr_misc & MC_MISC_PCIE_SLOT) >> 19), 510 (int)((rec->mr_misc & MC_MISC_PCIE_FUNC) >> 16)); 511 } 512 break; 513 default: 514 if ((mca_error & 0xfc00) == 0x0400) { 515 printf("internal error %x", mca_error & 0x03ff); 516 break; 517 } 518 519 /* Compound error codes. */ 520 521 /* Memory hierarchy error. */ 522 if ((mca_error & 0xeffc) == 0x000c) { 523 printf("%s memory error", mca_error_level(mca_error)); 524 break; 525 } 526 527 /* TLB error. */ 528 if ((mca_error & 0xeff0) == 0x0010) { 529 printf("%sTLB %s error", mca_error_ttype(mca_error), 530 mca_error_level(mca_error)); 531 break; 532 } 533 534 /* Memory controller error. */ 535 if ((mca_error & 0xef80) == 0x0080) { 536 printf("%s channel ", mca_error_mmtype(mca_error)); 537 if ((mca_error & 0x000f) != 0x000f) 538 printf("%d", mca_error & 0x000f); 539 else 540 printf("??"); 541 printf(" memory error"); 542 break; 543 } 544 545 /* Cache error. */ 546 if ((mca_error & 0xef00) == 0x0100) { 547 printf("%sCACHE %s %s error", 548 mca_error_ttype(mca_error), 549 mca_error_level(mca_error), 550 mca_error_request(mca_error)); 551 break; 552 } 553 554 /* Extended memory error. 
*/ 555 if ((mca_error & 0xef80) == 0x0280) { 556 printf("%s channel ", mca_error_mmtype(mca_error)); 557 if ((mca_error & 0x000f) != 0x000f) 558 printf("%d", mca_error & 0x000f); 559 else 560 printf("??"); 561 printf(" extended memory error"); 562 break; 563 } 564 565 /* Bus and/or Interconnect error. */ 566 if ((mca_error & 0xe800) == 0x0800) { 567 printf("BUS%s ", mca_error_level(mca_error)); 568 switch ((mca_error & 0x0600) >> 9) { 569 case 0: 570 printf("Source"); 571 break; 572 case 1: 573 printf("Responder"); 574 break; 575 case 2: 576 printf("Observer"); 577 break; 578 default: 579 printf("???"); 580 break; 581 } 582 printf(" %s ", mca_error_request(mca_error)); 583 switch ((mca_error & 0x000c) >> 2) { 584 case 0: 585 printf("Memory"); 586 break; 587 case 2: 588 printf("I/O"); 589 break; 590 case 3: 591 printf("Other"); 592 break; 593 default: 594 printf("???"); 595 break; 596 } 597 if (mca_error & 0x0100) 598 printf(" timed out"); 599 break; 600 } 601 602 printf("unknown error %x", mca_error); 603 break; 604 } 605 printf("\n"); 606 if (rec->mr_status & MC_STATUS_ADDRV) { 607 printf("MCA: Address 0x%llx", (long long)rec->mr_addr); 608 if (ser_supported(rec->mr_mcg_cap) && 609 (rec->mr_status & MC_STATUS_MISCV)) { 610 printf(" (Mode: %s, LSB: %d)", 611 mca_addres_mode(rec->mr_misc), 612 (int)(rec->mr_misc & MC_MISC_RA_LSB)); 613 } 614 printf("\n"); 615 } 616 if (rec->mr_status & MC_STATUS_MISCV) 617 printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc); 618 } 619 620 static bool 621 mca_is_mce(uint64_t mcg_cap, uint64_t status, bool *recoverablep) 622 { 623 624 /* Corrected error. */ 625 if ((status & MC_STATUS_UC) == 0) 626 return (0); 627 628 /* Spurious MCA error. */ 629 if ((status & MC_STATUS_EN) == 0) 630 return (0); 631 632 /* The processor does not support software error recovery. */ 633 if (!ser_supported(mcg_cap)) { 634 *recoverablep = false; 635 return (1); 636 } 637 638 /* Context might have been corrupted. 
*/ 639 if (status & MC_STATUS_PCC) { 640 *recoverablep = false; 641 return (1); 642 } 643 644 /* Uncorrected software recoverable. */ 645 if (status & MC_STATUS_S) { 646 /* Action required vs optional. */ 647 if (status & MC_STATUS_AR) 648 *recoverablep = false; 649 return (1); 650 } 651 652 /* Uncorrected no action required. */ 653 return (0); 654 } 655 656 static int 657 mca_check_status(enum scan_mode mode, uint64_t mcg_cap, int bank, 658 struct mca_record *rec, bool *recoverablep) 659 { 660 uint64_t status; 661 u_int p[4]; 662 bool mce, recover; 663 664 status = rdmsr(mca_msr_ops.status(bank)); 665 if (!(status & MC_STATUS_VAL)) 666 return (0); 667 668 recover = *recoverablep; 669 mce = mca_is_mce(mcg_cap, status, &recover); 670 if (mce != (mode == MCE)) 671 return (0); 672 *recoverablep = recover; 673 674 /* Save exception information. */ 675 rec->mr_status = status; 676 rec->mr_bank = bank; 677 rec->mr_addr = 0; 678 if (status & MC_STATUS_ADDRV) 679 rec->mr_addr = rdmsr(mca_msr_ops.addr(bank)); 680 rec->mr_misc = 0; 681 if (status & MC_STATUS_MISCV) 682 rec->mr_misc = rdmsr(mca_msr_ops.misc(bank)); 683 rec->mr_tsc = rdtsc(); 684 rec->mr_apic_id = PCPU_GET(apic_id); 685 rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP); 686 rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS); 687 rec->mr_cpu_id = cpu_id; 688 rec->mr_cpu_vendor_id = cpu_vendor_id; 689 rec->mr_cpu = PCPU_GET(cpuid); 690 691 /* 692 * Clear machine check. Don't do this for uncorrectable 693 * errors so that the BIOS can see them. 694 */ 695 if (!mce || recover) { 696 wrmsr(mca_msr_ops.status(bank), 0); 697 do_cpuid(0, p); 698 } 699 return (1); 700 } 701 702 static void 703 mca_resize_freelist(void) 704 { 705 struct mca_internal *next, *rec; 706 STAILQ_HEAD(, mca_internal) tmplist; 707 int count, i, desired_max, desired_min; 708 709 /* 710 * Ensure we have at least one record for each bank and one 711 * record per CPU, but no more than twice that amount. 
712 */ 713 desired_min = imax(mp_ncpus, mca_banks); 714 desired_max = imax(mp_ncpus, mca_banks) * 2; 715 STAILQ_INIT(&tmplist); 716 mtx_lock_spin(&mca_lock); 717 while (mca_freecount > desired_max) { 718 rec = STAILQ_FIRST(&mca_freelist); 719 KASSERT(rec != NULL, ("mca_freecount is %d, but list is empty", 720 mca_freecount)); 721 STAILQ_REMOVE_HEAD(&mca_freelist, link); 722 mca_freecount--; 723 STAILQ_INSERT_TAIL(&tmplist, rec, link); 724 } 725 while (mca_freecount < desired_min) { 726 count = desired_min - mca_freecount; 727 mtx_unlock_spin(&mca_lock); 728 for (i = 0; i < count; i++) { 729 rec = malloc(sizeof(*rec), M_MCA, M_WAITOK); 730 STAILQ_INSERT_TAIL(&tmplist, rec, link); 731 } 732 mtx_lock_spin(&mca_lock); 733 STAILQ_CONCAT(&mca_freelist, &tmplist); 734 mca_freecount += count; 735 } 736 mtx_unlock_spin(&mca_lock); 737 STAILQ_FOREACH_SAFE(rec, &tmplist, link, next) 738 free(rec, M_MCA); 739 } 740 741 static void 742 mca_resize(void *context, int pending) 743 { 744 745 mca_resize_freelist(); 746 } 747 748 static void 749 mca_record_entry(enum scan_mode mode, const struct mca_record *record) 750 { 751 struct mca_internal *rec; 752 753 if (mode == POLLED) { 754 rec = malloc(sizeof(*rec), M_MCA, M_WAITOK); 755 mtx_lock_spin(&mca_lock); 756 } else { 757 mtx_lock_spin(&mca_lock); 758 rec = STAILQ_FIRST(&mca_freelist); 759 if (rec == NULL) { 760 printf("MCA: Unable to allocate space for an event.\n"); 761 mca_log(record); 762 mtx_unlock_spin(&mca_lock); 763 return; 764 } 765 STAILQ_REMOVE_HEAD(&mca_freelist, link); 766 mca_freecount--; 767 } 768 769 rec->rec = *record; 770 STAILQ_INSERT_TAIL(&mca_pending, rec, link); 771 mtx_unlock_spin(&mca_lock); 772 } 773 774 #ifdef DEV_APIC 775 /* 776 * Update the interrupt threshold for a CMCI. The strategy is to use 777 * a low trigger that interrupts as soon as the first event occurs. 
778 * However, if a steady stream of events arrive, the threshold is 779 * increased until the interrupts are throttled to once every 780 * cmc_throttle seconds or the periodic scan. If a periodic scan 781 * finds that the threshold is too high, it is lowered. 782 */ 783 static int 784 update_threshold(enum scan_mode mode, int valid, int last_intr, int count, 785 int cur_threshold, int max_threshold) 786 { 787 u_int delta; 788 int limit; 789 790 delta = (u_int)(time_uptime - last_intr); 791 limit = cur_threshold; 792 793 /* 794 * If an interrupt was received less than cmc_throttle seconds 795 * since the previous interrupt and the count from the current 796 * event is greater than or equal to the current threshold, 797 * double the threshold up to the max. 798 */ 799 if (mode == CMCI && valid) { 800 if (delta < cmc_throttle && count >= limit && 801 limit < max_threshold) { 802 limit = min(limit << 1, max_threshold); 803 } 804 return (limit); 805 } 806 807 /* 808 * When the banks are polled, check to see if the threshold 809 * should be lowered. 810 */ 811 if (mode != POLLED) 812 return (limit); 813 814 /* If a CMCI occurred recently, do nothing for now. */ 815 if (delta < cmc_throttle) 816 return (limit); 817 818 /* 819 * Compute a new limit based on the average rate of events per 820 * cmc_throttle seconds since the last interrupt. 821 */ 822 if (valid) { 823 limit = count * cmc_throttle / delta; 824 if (limit <= 0) 825 limit = 1; 826 else if (limit > max_threshold) 827 limit = max_threshold; 828 } else { 829 limit = 1; 830 } 831 return (limit); 832 } 833 834 static void 835 cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec) 836 { 837 struct cmc_state *cc; 838 uint64_t ctl; 839 int cur_threshold, new_threshold; 840 int count; 841 842 /* Fetch the current limit for this bank. 
*/ 843 cc = &cmc_state[PCPU_GET(cpuid)][bank]; 844 ctl = rdmsr(MSR_MC_CTL2(bank)); 845 count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38; 846 cur_threshold = ctl & MC_CTL2_THRESHOLD; 847 848 new_threshold = update_threshold(mode, valid, cc->last_intr, count, 849 cur_threshold, cc->max_threshold); 850 851 if (mode == CMCI && valid) 852 cc->last_intr = time_uptime; 853 if (new_threshold != cur_threshold) { 854 ctl &= ~MC_CTL2_THRESHOLD; 855 ctl |= new_threshold; 856 wrmsr(MSR_MC_CTL2(bank), ctl); 857 } 858 } 859 860 static void 861 amd_thresholding_update(enum scan_mode mode, int bank, int valid) 862 { 863 struct amd_et_state *cc; 864 uint64_t misc; 865 int new_threshold; 866 int count; 867 868 cc = &amd_et_state[PCPU_GET(cpuid)][bank]; 869 misc = rdmsr(mca_msr_ops.misc(bank)); 870 count = (misc & MC_MISC_AMD_CNT_MASK) >> MC_MISC_AMD_CNT_SHIFT; 871 count = count - (MC_MISC_AMD_CNT_MAX - cc->cur_threshold); 872 873 new_threshold = update_threshold(mode, valid, cc->last_intr, count, 874 cc->cur_threshold, MC_MISC_AMD_CNT_MAX); 875 876 cc->cur_threshold = new_threshold; 877 misc &= ~MC_MISC_AMD_CNT_MASK; 878 misc |= (uint64_t)(MC_MISC_AMD_CNT_MAX - cc->cur_threshold) 879 << MC_MISC_AMD_CNT_SHIFT; 880 misc &= ~MC_MISC_AMD_OVERFLOW; 881 wrmsr(mca_msr_ops.misc(bank), misc); 882 if (mode == CMCI && valid) 883 cc->last_intr = time_uptime; 884 } 885 #endif 886 887 /* 888 * This scans all the machine check banks of the current CPU to see if 889 * there are any machine checks. Any non-recoverable errors are 890 * reported immediately via mca_log(). The current thread must be 891 * pinned when this is called. The 'mode' parameter indicates if we 892 * are being called from the MC exception handler, the CMCI handler, 893 * or the periodic poller. 
894 */ 895 static int 896 mca_scan(enum scan_mode mode, bool *recoverablep) 897 { 898 struct mca_record rec; 899 uint64_t mcg_cap; 900 int count = 0, i, valid; 901 902 mcg_cap = rdmsr(MSR_MCG_CAP); 903 for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) { 904 #ifdef DEV_APIC 905 /* 906 * For a CMCI, only check banks this CPU is 907 * responsible for. 908 */ 909 if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i)) 910 continue; 911 #endif 912 913 valid = mca_check_status(mode, mcg_cap, i, &rec, recoverablep); 914 if (valid) { 915 count++; 916 if (*recoverablep) 917 mca_record_entry(mode, &rec); 918 else 919 mca_log(&rec); 920 } 921 922 #ifdef DEV_APIC 923 /* 924 * If this is a bank this CPU monitors via CMCI, 925 * update the threshold. 926 */ 927 if (PCPU_GET(cmci_mask) & 1 << i) { 928 if (cmc_state != NULL) 929 cmci_update(mode, i, valid, &rec); 930 else 931 amd_thresholding_update(mode, i, valid); 932 } 933 #endif 934 } 935 return (count); 936 } 937 938 /* 939 * Store a new record on the mca_records list while enforcing 940 * mca_maxcount. 941 */ 942 static void 943 mca_store_record(struct mca_internal *mca) 944 { 945 946 /* 947 * If we are storing no records (mca_maxcount == 0), 948 * we just free this record. 949 * 950 * If we are storing records (mca_maxcount != 0) and 951 * we have free space on the list, store the record 952 * and increment mca_count. 953 * 954 * If we are storing records and we do not have free 955 * space on the list, store the new record at the 956 * tail and free the oldest one from the head. 
957 */ 958 if (mca_maxcount != 0) 959 STAILQ_INSERT_TAIL(&mca_records, mca, link); 960 if (mca_maxcount < 0 || mca_count < mca_maxcount) 961 mca_count++; 962 else { 963 if (mca_maxcount != 0) { 964 mca = STAILQ_FIRST(&mca_records); 965 STAILQ_REMOVE_HEAD(&mca_records, link); 966 } 967 STAILQ_INSERT_TAIL(&mca_freelist, mca, link); 968 mca_freecount++; 969 } 970 } 971 972 /* 973 * Do the work to process machine check records which have just been 974 * gathered. Print any pending logs to the console. Queue them for storage. 975 * Trigger a resizing of the free list. 976 */ 977 static void 978 mca_process_records(enum scan_mode mode) 979 { 980 struct mca_internal *mca; 981 982 mtx_lock_spin(&mca_lock); 983 while ((mca = STAILQ_FIRST(&mca_pending)) != NULL) { 984 STAILQ_REMOVE_HEAD(&mca_pending, link); 985 mca_log(&mca->rec); 986 mca_store_record(mca); 987 } 988 mtx_unlock_spin(&mca_lock); 989 if (mode == POLLED) 990 mca_resize_freelist(); 991 else if (!cold) 992 taskqueue_enqueue(mca_tq, &mca_resize_task); 993 } 994 995 /* 996 * Scan the machine check banks on all CPUs by binding to each CPU in 997 * turn. If any of the CPUs contained new machine check records, log 998 * them to the console. 
999 */ 1000 static void 1001 mca_scan_cpus(void *context, int pending) 1002 { 1003 struct thread *td; 1004 int cpu; 1005 bool recoverable = true; 1006 1007 mca_resize_freelist(); 1008 td = curthread; 1009 thread_lock(td); 1010 CPU_FOREACH(cpu) { 1011 sched_bind(td, cpu); 1012 thread_unlock(td); 1013 mca_scan(POLLED, &recoverable); 1014 thread_lock(td); 1015 sched_unbind(td); 1016 } 1017 thread_unlock(td); 1018 if (!STAILQ_EMPTY(&mca_pending)) 1019 mca_process_records(POLLED); 1020 taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task, 1021 mca_ticks * SBT_1S, 0, C_PREL(1)); 1022 } 1023 1024 static int 1025 sysctl_mca_scan(SYSCTL_HANDLER_ARGS) 1026 { 1027 int error, i; 1028 1029 i = 0; 1030 error = sysctl_handle_int(oidp, &i, 0, req); 1031 if (error) 1032 return (error); 1033 if (i) 1034 taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task, 1035 0, 0, 0); 1036 return (0); 1037 } 1038 1039 static int 1040 sysctl_mca_maxcount(SYSCTL_HANDLER_ARGS) 1041 { 1042 struct mca_internal *mca; 1043 int error, i; 1044 bool doresize; 1045 1046 i = mca_maxcount; 1047 error = sysctl_handle_int(oidp, &i, 0, req); 1048 if (error || req->newptr == NULL) 1049 return (error); 1050 mtx_lock_spin(&mca_lock); 1051 mca_maxcount = i; 1052 doresize = false; 1053 if (mca_maxcount >= 0) 1054 while (mca_count > mca_maxcount) { 1055 mca = STAILQ_FIRST(&mca_records); 1056 STAILQ_REMOVE_HEAD(&mca_records, link); 1057 mca_count--; 1058 STAILQ_INSERT_TAIL(&mca_freelist, mca, link); 1059 mca_freecount++; 1060 doresize = true; 1061 } 1062 mtx_unlock_spin(&mca_lock); 1063 if (doresize && !cold) 1064 taskqueue_enqueue(mca_tq, &mca_resize_task); 1065 return (error); 1066 } 1067 1068 static void 1069 mca_startup(void *dummy) 1070 { 1071 1072 if (mca_banks <= 0) 1073 return; 1074 1075 /* CMCIs during boot may have claimed items from the freelist. 
*/ 1076 mca_resize_freelist(); 1077 1078 taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq"); 1079 taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task, 1080 mca_ticks * SBT_1S, 0, C_PREL(1)); 1081 } 1082 #ifdef EARLY_AP_STARTUP 1083 SYSINIT(mca_startup, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, mca_startup, NULL); 1084 #else 1085 SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL); 1086 #endif 1087 1088 #ifdef DEV_APIC 1089 static void 1090 cmci_setup(void) 1091 { 1092 int i; 1093 1094 cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *), M_MCA, 1095 M_WAITOK); 1096 for (i = 0; i <= mp_maxid; i++) 1097 cmc_state[i] = malloc(sizeof(struct cmc_state) * mca_banks, 1098 M_MCA, M_WAITOK | M_ZERO); 1099 SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, 1100 "cmc_throttle", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 1101 &cmc_throttle, 0, sysctl_positive_int, "I", 1102 "Interval in seconds to throttle corrected MC interrupts"); 1103 } 1104 1105 static void 1106 amd_thresholding_setup(void) 1107 { 1108 u_int i; 1109 1110 amd_et_state = malloc((mp_maxid + 1) * sizeof(struct amd_et_state *), 1111 M_MCA, M_WAITOK); 1112 for (i = 0; i <= mp_maxid; i++) 1113 amd_et_state[i] = malloc(sizeof(struct amd_et_state) * 1114 mca_banks, M_MCA, M_WAITOK | M_ZERO); 1115 SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, 1116 "cmc_throttle", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 1117 &cmc_throttle, 0, sysctl_positive_int, "I", 1118 "Interval in seconds to throttle corrected MC interrupts"); 1119 } 1120 #endif 1121 1122 static void 1123 mca_setup(uint64_t mcg_cap) 1124 { 1125 1126 /* 1127 * On AMD Family 10h processors, unless logging of level one TLB 1128 * parity (L1TP) errors is disabled, enable the recommended workaround 1129 * for Erratum 383. 
1130 */ 1131 if (cpu_vendor_id == CPU_VENDOR_AMD && 1132 CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP) 1133 workaround_erratum383 = 1; 1134 1135 mca_banks = mcg_cap & MCG_CAP_COUNT; 1136 mtx_init(&mca_lock, "mca", NULL, MTX_SPIN); 1137 STAILQ_INIT(&mca_records); 1138 STAILQ_INIT(&mca_pending); 1139 mca_tq = taskqueue_create_fast("mca", M_WAITOK, 1140 taskqueue_thread_enqueue, &mca_tq); 1141 TIMEOUT_TASK_INIT(mca_tq, &mca_scan_task, 0, mca_scan_cpus, NULL); 1142 STAILQ_INIT(&mca_freelist); 1143 TASK_INIT(&mca_resize_task, 0, mca_resize, NULL); 1144 mca_resize_freelist(); 1145 SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, 1146 "count", CTLFLAG_RD, (int *)(uintptr_t)&mca_count, 0, 1147 "Record count"); 1148 SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, 1149 "maxcount", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 1150 &mca_maxcount, 0, sysctl_mca_maxcount, "I", 1151 "Maximum record count (-1 is unlimited)"); 1152 SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, 1153 "interval", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 1154 &mca_ticks, 0, sysctl_positive_int, "I", 1155 "Periodic interval in seconds to scan for machine checks"); 1156 SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, 1157 "records", CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mca_records, 1158 "Machine check records"); 1159 SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, 1160 "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, 1161 sysctl_mca_scan, "I", "Force an immediate scan for machine checks"); 1162 #ifdef DEV_APIC 1163 if (cmci_supported(mcg_cap)) 1164 cmci_setup(); 1165 else if (amd_thresholding_supported()) 1166 amd_thresholding_setup(); 1167 #endif 1168 } 1169 1170 #ifdef DEV_APIC 1171 /* 1172 * See if we should monitor CMCI for this bank. If CMCI_EN is already 1173 * set in MC_CTL2, then another CPU is responsible for this bank, so 1174 * ignore it. 
 * If CMCI_EN returns zero after being set, then this bank
 * does not support CMCI_EN.  If this CPU sets CMCI_EN, then it should
 * now monitor this bank.
 */
static void
cmci_monitor(int i)
{
	struct cmc_state *cc;
	uint64_t ctl;

	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));

	/*
	 * It is possible for some APs to report CMCI support even if the BSP
	 * does not, apparently due to a BIOS bug.
	 */
	if (cmc_state == NULL) {
		if (bootverbose) {
			printf(
		    "AP %d (%d,%d) reports CMCI support but the BSP does not\n",
			    PCPU_GET(cpuid), PCPU_GET(apic_id),
			    PCPU_GET(acpi_id));
		}
		return;
	}

	ctl = rdmsr(MSR_MC_CTL2(i));
	if (ctl & MC_CTL2_CMCI_EN)
		/* Already monitored by another CPU. */
		return;

	/* Set the threshold to one event for now. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= MC_CTL2_CMCI_EN | 1;
	wrmsr(MSR_MC_CTL2(i), ctl);
	/* Read back: CMCI_EN stays clear if the bank lacks CMCI support. */
	ctl = rdmsr(MSR_MC_CTL2(i));
	if (!(ctl & MC_CTL2_CMCI_EN))
		/* This bank does not support CMCI. */
		return;

	cc = &cmc_state[PCPU_GET(cpuid)][i];

	/*
	 * Determine maximum threshold: write all-ones to the threshold
	 * field and read back what the hardware actually latched.
	 */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= 0x7fff;
	wrmsr(MSR_MC_CTL2(i), ctl);
	ctl = rdmsr(MSR_MC_CTL2(i));
	cc->max_threshold = ctl & MC_CTL2_THRESHOLD;

	/* Start off with a threshold of 1. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= 1;
	wrmsr(MSR_MC_CTL2(i), ctl);

	/* Mark this bank as monitored. */
	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
}

/*
 * For resume, reset the threshold for any banks we monitor back to
 * one and throw away the timestamp of the last interrupt.
 */
static void
cmci_resume(int i)
{
	struct cmc_state *cc;
	uint64_t ctl;

	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));

	/* See cmci_monitor(). */
	if (cmc_state == NULL)
		return;

	/* Ignore banks not monitored by this CPU. */
	if (!(PCPU_GET(cmci_mask) & 1 << i))
		return;

	cc = &cmc_state[PCPU_GET(cpuid)][i];
	cc->last_intr = 0;
	ctl = rdmsr(MSR_MC_CTL2(i));
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= MC_CTL2_CMCI_EN | 1;
	wrmsr(MSR_MC_CTL2(i), ctl);
}

/*
 * Apply an AMD ET configuration to the corresponding MSR.
 */
static void
amd_thresholding_start(struct amd_et_state *cc, int bank)
{
	uint64_t misc;

	KASSERT(amd_elvt >= 0, ("ELVT offset is not set"));

	misc = rdmsr(mca_msr_ops.misc(bank));

	/* Deliver counter overflows via the extended LVT interrupt. */
	misc &= ~MC_MISC_AMD_INT_MASK;
	misc |= MC_MISC_AMD_INT_LVT;

	/* Select the extended LVT offset obtained from the local APIC. */
	misc &= ~MC_MISC_AMD_LVT_MASK;
	misc |= (uint64_t)amd_elvt << MC_MISC_AMD_LVT_SHIFT;

	/*
	 * The counter counts up and interrupts at CNT_MAX, so preload it
	 * with CNT_MAX - cur_threshold to fire after that many events.
	 */
	misc &= ~MC_MISC_AMD_CNT_MASK;
	misc |= (uint64_t)(MC_MISC_AMD_CNT_MAX - cc->cur_threshold)
	    << MC_MISC_AMD_CNT_SHIFT;

	/* Clear any stale overflow and enable the counter. */
	misc &= ~MC_MISC_AMD_OVERFLOW;
	misc |= MC_MISC_AMD_CNTEN;

	wrmsr(mca_msr_ops.misc(bank), misc);
}

/*
 * See if we should claim and enable AMD error thresholding for this
 * bank on this CPU.
 */
static void
amd_thresholding_monitor(int i)
{
	struct amd_et_state *cc;
	uint64_t misc;

	/*
	 * Kludge: On 10h, banks after 4 are not thresholding but also may have
	 * bogus Valid bits.  Skip them.  This is definitely fixed in 15h, but
	 * I have not investigated whether it is fixed in earlier models.
	 */
	if (CPUID_TO_FAMILY(cpu_id) < 0x15 && i >= 5)
		return;

	/* The counter must be valid and present. */
	misc = rdmsr(mca_msr_ops.misc(i));
	if ((misc & (MC_MISC_AMD_VAL | MC_MISC_AMD_CNTP)) !=
	    (MC_MISC_AMD_VAL | MC_MISC_AMD_CNTP))
		return;

	/* The register should not be locked.
 */
	if ((misc & MC_MISC_AMD_LOCK) != 0) {
		if (bootverbose)
			printf("%s: 0x%jx: Bank %d: locked\n", __func__,
			    (uintmax_t)misc, i);
		return;
	}

	/*
	 * If counter is enabled then either the firmware or another CPU
	 * has already claimed it.
	 */
	if ((misc & MC_MISC_AMD_CNTEN) != 0) {
		if (bootverbose)
			printf("%s: 0x%jx: Bank %d: already enabled\n",
			    __func__, (uintmax_t)misc, i);
		return;
	}

	/*
	 * Configure an Extended Interrupt LVT register for reporting
	 * counter overflows if that feature is supported and the first
	 * extended register is available.
	 */
	amd_elvt = lapic_enable_mca_elvt();
	if (amd_elvt < 0) {
		printf("%s: Bank %d: lapic enable mca elvt failed: %d\n",
		    __func__, i, amd_elvt);
		return;
	}

	/* Claim the bank: start with a threshold of one event. */
	cc = &amd_et_state[PCPU_GET(cpuid)][i];
	cc->cur_threshold = 1;
	amd_thresholding_start(cc, i);

	/* Mark this bank as monitored. */
	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
}

/*
 * For resume, restart thresholding on any banks this CPU monitors,
 * resetting the threshold back to one event and throwing away the
 * timestamp of the last interrupt.
 */
static void
amd_thresholding_resume(int i)
{
	struct amd_et_state *cc;

	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));

	/* Ignore banks not monitored by this CPU. */
	if (!(PCPU_GET(cmci_mask) & 1 << i))
		return;

	cc = &amd_et_state[PCPU_GET(cpuid)][i];
	cc->last_intr = 0;
	cc->cur_threshold = 1;
	amd_thresholding_start(cc, i);
}
#endif

/*
 * Initializes per-CPU machine check registers and enables corrected
 * machine check interrupts.
 *
 * Runs on every CPU, both at boot (boot != 0) and on resume
 * (boot == 0); the BSP additionally performs the one-time global
 * setup via mca_setup() at boot.
 */
static void
_mca_init(int boot)
{
	uint64_t mcg_cap;
	uint64_t ctl, mask;
	int i, skip, family;

	family = CPUID_TO_FAMILY(cpu_id);

	/* MCE is required. */
	if (!mca_enabled || !(cpu_feature & CPUID_MCE))
		return;

	if (cpu_feature & CPUID_MCA) {
		if (boot)
			PCPU_SET(cmci_mask, 0);

		mcg_cap = rdmsr(MSR_MCG_CAP);
		if (mcg_cap & MCG_CAP_CTL_P)
			/* Enable MCA features. */
			wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
		if (IS_BSP() && boot)
			mca_setup(mcg_cap);

		/*
		 * Disable logging of level one TLB parity (L1TP) errors by
		 * the data cache as an alternative workaround for AMD Family
		 * 10h Erratum 383.  Unlike the recommended workaround, there
		 * is no performance penalty to this workaround.  However,
		 * L1TP errors will go unreported.
		 */
		if (cpu_vendor_id == CPU_VENDOR_AMD && family == 0x10 &&
		    !amd10h_L1TP) {
			mask = rdmsr(MSR_MC0_CTL_MASK);
			if ((mask & (1UL << 5)) == 0)
				wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5));
		}
		/* Switch to the Scalable MCA register layout if advertised. */
		if (amd_rascap & AMDRAS_SCALABLE_MCA) {
			mca_msr_ops.ctl = mca_smca_ctl_reg;
			mca_msr_ops.status = mca_smca_status_reg;
			mca_msr_ops.addr = mca_smca_addr_reg;
			mca_msr_ops.misc = mca_smca_misc_reg;
		}

		/* Enable local MCE if supported. */
		if (cpu_vendor_id == CPU_VENDOR_INTEL &&
		    (mcg_cap & MCG_CAP_LMCE_P) &&
		    (rdmsr(MSR_IA32_FEATURE_CONTROL) &
		    IA32_FEATURE_CONTROL_LMCE_EN))
			wrmsr(MSR_MCG_EXT_CTL, rdmsr(MSR_MCG_EXT_CTL) | 1);

		/*
		 * The cmci_monitor() must not be executed
		 * simultaneously by several CPUs.
		 */
		if (boot)
			mtx_lock_spin(&mca_lock);

		for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
			/* By default enable logging of all errors. */
			ctl = 0xffffffffffffffffUL;
			skip = 0;

			if (cpu_vendor_id == CPU_VENDOR_INTEL) {
				/*
				 * For P6 models before Nehalem MC0_CTL is
				 * always enabled and reserved.
 */
				if (i == 0 && family == 0x6
				    && CPUID_TO_MODEL(cpu_id) < 0x1a)
					skip = 1;
			} else if (cpu_vendor_id == CPU_VENDOR_AMD) {
				/* BKDG for Family 10h: unset GartTblWkEn. */
				if (i == MC_AMDNB_BANK && family >= 0xf &&
				    family < 0x17)
					ctl &= ~(1UL << 10);
			}

			if (!skip)
				wrmsr(mca_msr_ops.ctl(i), ctl);

#ifdef DEV_APIC
			if (cmci_supported(mcg_cap)) {
				if (boot)
					cmci_monitor(i);
				else
					cmci_resume(i);
			} else if (amd_thresholding_supported()) {
				if (boot)
					amd_thresholding_monitor(i);
				else
					amd_thresholding_resume(i);
			}
#endif

			/* Clear all errors. */
			wrmsr(mca_msr_ops.status(i), 0);
		}
		if (boot)
			mtx_unlock_spin(&mca_lock);

#ifdef DEV_APIC
		/* Only enable the CMCI LVT if this CPU claimed any banks. */
		if (cmci_supported(mcg_cap) &&
		    PCPU_GET(cmci_mask) != 0 && boot)
			lapic_enable_cmc();
#endif
	}

	/* Enable machine check exceptions on this CPU. */
	load_cr4(rcr4() | CR4_MCE);
}

/* Must be executed on each CPU during boot. */
void
mca_init(void)
{

	_mca_init(1);
}

/* Must be executed on each CPU during resume. */
void
mca_resume(void)
{

	_mca_init(0);
}

/*
 * The machine check registers for the BSP cannot be initialized until
 * the local APIC is initialized.  This happens at SI_SUB_CPU,
 * SI_ORDER_SECOND.
 */
static void
mca_init_bsp(void *arg __unused)
{

	mca_init();
}
SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL);

/* Called when a machine check exception fires. */
void
mca_intr(void)
{
	uint64_t mcg_status;
	int count;
	bool lmcs, recoverable;

	if (!(cpu_feature & CPUID_MCA)) {
		/*
		 * Just print the values of the old Pentium registers
		 * and panic.
		 */
		printf("MC Type: 0x%jx Address: 0x%jx\n",
		    (uintmax_t)rdmsr(MSR_P5_MC_TYPE),
		    (uintmax_t)rdmsr(MSR_P5_MC_ADDR));
		panic("Machine check exception");
	}

	/* Scan the banks and check for any non-recoverable errors. */
	mcg_status = rdmsr(MSR_MCG_STATUS);
	/* RIPV set means the interrupted instruction can be restarted. */
	recoverable = (mcg_status & MCG_STATUS_RIPV) != 0;
	/* lmcs: the MCE was signaled locally rather than broadcast. */
	lmcs = (cpu_vendor_id != CPU_VENDOR_INTEL ||
	    (mcg_status & MCG_STATUS_LMCS));
	count = mca_scan(MCE, &recoverable);

	if (!recoverable) {
		/*
		 * Only panic if the error was detected local to this CPU.
		 * Some errors will assert a machine check on all CPUs, but
		 * only certain CPUs will find a valid bank to log.
		 */
		while (!lmcs && count == 0)
			cpu_spinwait();

		panic("Unrecoverable machine check exception");
	}

	/* Clear MCIP. */
	wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
}

#ifdef DEV_APIC
/* Called for a CMCI (correctable machine check interrupt). */
void
cmc_intr(void)
{
	bool recoverable = true;

	/*
	 * Serialize MCA bank scanning to prevent collisions from
	 * sibling threads.
	 *
	 * If we found anything, log them to the console.
	 */
	if (mca_scan(CMCI, &recoverable) != 0)
		mca_process_records(CMCI);
}
#endif