xref: /freebsd/sys/x86/x86/mca.c (revision c7046f76)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2009 Hudson River Trading LLC
5  * Written by: John H. Baldwin <jhb@FreeBSD.org>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 /*
31  * Support for x86 machine check architecture.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #ifdef __amd64__
38 #define	DEV_APIC
39 #else
40 #include "opt_apic.h"
41 #endif
42 
43 #include <sys/param.h>
44 #include <sys/bus.h>
45 #include <sys/interrupt.h>
46 #include <sys/kernel.h>
47 #include <sys/lock.h>
48 #include <sys/malloc.h>
49 #include <sys/mutex.h>
50 #include <sys/proc.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
53 #include <sys/sysctl.h>
54 #include <sys/systm.h>
55 #include <sys/taskqueue.h>
56 #include <machine/intr_machdep.h>
57 #include <x86/apicvar.h>
58 #include <machine/cpu.h>
59 #include <machine/cputypes.h>
60 #include <x86/mca.h>
61 #include <machine/md_var.h>
62 #include <machine/specialreg.h>
63 
64 /* Modes for mca_scan() */
65 enum scan_mode {
66 	POLLED,
67 	MCE,
68 	CMCI,
69 };
70 
71 #ifdef DEV_APIC
72 /*
73  * State maintained for each monitored MCx bank to control the
74  * corrected machine check interrupt threshold.
75  */
76 struct cmc_state {
77 	int	max_threshold;
78 	time_t	last_intr;
79 };
80 
81 struct amd_et_state {
82 	int	cur_threshold;
83 	time_t	last_intr;
84 };
85 #endif
86 
87 struct mca_internal {
88 	struct mca_record rec;
89 	STAILQ_ENTRY(mca_internal) link;
90 };
91 
92 struct mca_enumerator_ops {
93         unsigned int (*ctl)(int);
94         unsigned int (*status)(int);
95         unsigned int (*addr)(int);
96         unsigned int (*misc)(int);
97 };
98 
99 static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");
100 
101 static volatile int mca_count;	/* Number of records stored. */
102 static int mca_banks;		/* Number of per-CPU register banks. */
103 static int mca_maxcount = -1;	/* Limit on records stored. (-1 = unlimited) */
104 
105 static SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
106     "Machine Check Architecture");
107 
108 static int mca_enabled = 1;
109 SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
110     "Administrative toggle for machine check support");
111 
112 static int log_corrected = 1;
113 SYSCTL_INT(_hw_mca, OID_AUTO, log_corrected, CTLFLAG_RWTUN, &log_corrected, 0,
114     "Log corrected errors to the console");
115 
116 static int amd10h_L1TP = 1;
117 SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0,
118     "Administrative toggle for logging of level one TLB parity (L1TP) errors");
119 
120 static int intel6h_HSD131;
121 SYSCTL_INT(_hw_mca, OID_AUTO, intel6h_HSD131, CTLFLAG_RDTUN, &intel6h_HSD131, 0,
122     "Administrative toggle for logging of spurious corrected errors");
123 
124 int workaround_erratum383;
125 SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTUN,
126     &workaround_erratum383, 0,
127     "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");
128 
129 static STAILQ_HEAD(, mca_internal) mca_freelist;
130 static int mca_freecount;
131 static STAILQ_HEAD(, mca_internal) mca_records;
132 static STAILQ_HEAD(, mca_internal) mca_pending;
133 static int mca_ticks = 300;
134 static struct taskqueue *mca_tq;
135 static struct task mca_resize_task;
136 static struct timeout_task mca_scan_task;
137 static struct mtx mca_lock;
138 
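/*
 * MSR address helpers for the legacy IA32 MCA bank layout.  Systems with
 * AMD Scalable MCA use the mca_smca_*_reg variants below instead; the
 * active set is selected through mca_msr_ops (see _mca_init()).
 */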
139 static unsigned int
140 mca_ia32_ctl_reg(int bank)
141 {
142 	return (MSR_MC_CTL(bank));
143 }
144 
145 static unsigned int
146 mca_ia32_status_reg(int bank)
147 {
148 	return (MSR_MC_STATUS(bank));
149 }
150 
151 static unsigned int
152 mca_ia32_addr_reg(int bank)
153 {
154 	return (MSR_MC_ADDR(bank));
155 }
156 
157 static unsigned int
158 mca_ia32_misc_reg(int bank)
159 {
160 	return (MSR_MC_MISC(bank));
161 }
162 
163 static unsigned int
164 mca_smca_ctl_reg(int bank)
165 {
166         return (MSR_SMCA_MC_CTL(bank));
167 }
168 
169 static unsigned int
170 mca_smca_status_reg(int bank)
171 {
172         return (MSR_SMCA_MC_STATUS(bank));
173 }
174 
175 static unsigned int
176 mca_smca_addr_reg(int bank)
177 {
178         return (MSR_SMCA_MC_ADDR(bank));
179 }
180 
181 static unsigned int
182 mca_smca_misc_reg(int bank)
183 {
184         return (MSR_SMCA_MC_MISC(bank));
185 }
186 
187 static struct mca_enumerator_ops mca_msr_ops = {
188         .ctl    = mca_ia32_ctl_reg,
189         .status = mca_ia32_status_reg,
190         .addr   = mca_ia32_addr_reg,
191         .misc   = mca_ia32_misc_reg
192 };
193 
194 #ifdef DEV_APIC
195 static struct cmc_state **cmc_state;		/* Indexed by cpuid, bank. */
196 static struct amd_et_state **amd_et_state;	/* Indexed by cpuid, bank. */
197 static int cmc_throttle = 60;	/* Time in seconds to throttle CMCI. */
198 
199 static int amd_elvt = -1;
200 
201 static inline bool
202 amd_thresholding_supported(void)
203 {
204 	if (cpu_vendor_id != CPU_VENDOR_AMD &&
205 	    cpu_vendor_id != CPU_VENDOR_HYGON)
206 		return (false);
207 	/*
208 	 * The RASCap register is wholly reserved in families 0x10-0x15 (through model 1F).
209 	 *
210 	 * It begins to be documented in family 0x15 model 30 and family 0x16,
211 	 * but neither of these families documents the ScalableMca bit, which
212 	 * supposedly defines the presence of this feature on family 0x17.
213 	 */
214 	if (CPUID_TO_FAMILY(cpu_id) >= 0x10 && CPUID_TO_FAMILY(cpu_id) <= 0x16)
215 		return (true);
216 	if (CPUID_TO_FAMILY(cpu_id) >= 0x17)
217 		return ((amd_rascap & AMDRAS_SCALABLE_MCA) != 0);
218 	return (false);
219 }
220 #endif
221 
222 static inline bool
223 cmci_supported(uint64_t mcg_cap)
224 {
225 	/*
226 	 * MCG_CAP_CMCI_P bit is reserved in AMD documentation.  Until
227 	 * it is defined, do not use it to check for CMCI support.
228 	 */
229 	if (cpu_vendor_id != CPU_VENDOR_INTEL)
230 		return (false);
231 	return ((mcg_cap & MCG_CAP_CMCI_P) != 0);
232 }
233 
234 static inline bool
235 tes_supported(uint64_t mcg_cap)
236 {
237 
238 	/*
239 	 * MCG_CAP_TES_P bit is reserved in AMD documentation.  Until
240 	 * it is defined, do not use it to check for TES support.
241 	 */
242 	if (cpu_vendor_id != CPU_VENDOR_INTEL)
243 		return (false);
244 	return ((mcg_cap & MCG_CAP_TES_P) != 0);
245 }
246 
247 static inline bool
248 ser_supported(uint64_t mcg_cap)
249 {
250 
251 	return (tes_supported(mcg_cap) && (mcg_cap & MCG_CAP_SER_P) != 0);
252 }
253 
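/* Sysctl handler that only accepts strictly positive integer values. */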
254 static int
255 sysctl_positive_int(SYSCTL_HANDLER_ARGS)
256 {
257 	int error, value;
258 
259 	value = *(int *)arg1;
260 	error = sysctl_handle_int(oidp, &value, 0, req);
261 	if (error || req->newptr == NULL)
262 		return (error);
263 	if (value <= 0)
264 		return (EINVAL);
265 	*(int *)arg1 = value;
266 	return (0);
267 }
268 
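/*
 * Sysctl handler backing hw.mca.records.<index>: copy the index'th record
 * on the mca_records list out to userland.
 */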
269 static int
270 sysctl_mca_records(SYSCTL_HANDLER_ARGS)
271 {
272 	int *name = (int *)arg1;
273 	u_int namelen = arg2;
274 	struct mca_record record;
275 	struct mca_internal *rec;
276 	int i;
277 
278 	if (namelen != 1)
279 		return (EINVAL);
280 
281 	if (name[0] < 0 || name[0] >= mca_count)
282 		return (EINVAL);
283 
284 	mtx_lock_spin(&mca_lock);
285 	if (name[0] >= mca_count) {
286 		mtx_unlock_spin(&mca_lock);
287 		return (EINVAL);
288 	}
289 	i = 0;
290 	STAILQ_FOREACH(rec, &mca_records, link) {
291 		if (i == name[0]) {
292 			record = rec->rec;
293 			break;
294 		}
295 		i++;
296 	}
297 	mtx_unlock_spin(&mca_lock);
298 	return (SYSCTL_OUT(req, &record, sizeof(record)));
299 }
300 
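/*
 * Helpers that decode the subfields of compound MCA error codes and of
 * MCi_MISC for mca_log() below.
 */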
301 static const char *
302 mca_error_ttype(uint16_t mca_error)
303 {
304 
305 	switch ((mca_error & 0x000c) >> 2) {
306 	case 0:
307 		return ("I");
308 	case 1:
309 		return ("D");
310 	case 2:
311 		return ("G");
312 	}
313 	return ("?");
314 }
315 
316 static const char *
317 mca_error_level(uint16_t mca_error)
318 {
319 
320 	switch (mca_error & 0x0003) {
321 	case 0:
322 		return ("L0");
323 	case 1:
324 		return ("L1");
325 	case 2:
326 		return ("L2");
327 	case 3:
328 		return ("LG");
329 	}
330 	return ("L?");
331 }
332 
333 static const char *
334 mca_error_request(uint16_t mca_error)
335 {
336 
337 	switch ((mca_error & 0x00f0) >> 4) {
338 	case 0x0:
339 		return ("ERR");
340 	case 0x1:
341 		return ("RD");
342 	case 0x2:
343 		return ("WR");
344 	case 0x3:
345 		return ("DRD");
346 	case 0x4:
347 		return ("DWR");
348 	case 0x5:
349 		return ("IRD");
350 	case 0x6:
351 		return ("PREFETCH");
352 	case 0x7:
353 		return ("EVICT");
354 	case 0x8:
355 		return ("SNOOP");
356 	}
357 	return ("???");
358 }
359 
360 static const char *
361 mca_error_mmtype(uint16_t mca_error)
362 {
363 
364 	switch ((mca_error & 0x70) >> 4) {
365 	case 0x0:
366 		return ("GEN");
367 	case 0x1:
368 		return ("RD");
369 	case 0x2:
370 		return ("WR");
371 	case 0x3:
372 		return ("AC");
373 	case 0x4:
374 		return ("MS");
375 	}
376 	return ("???");
377 }
378 
379 static const char *
380 mca_address_mode(uint64_t mca_misc)
381 {
382 
383 	switch ((mca_misc & MC_MISC_ADDRESS_MODE) >> 6) {
384 	case 0x0:
385 		return ("Segment Offset");
386 	case 0x1:
387 		return ("Linear Address");
388 	case 0x2:
389 		return ("Physical Address");
390 	case 0x3:
391 		return ("Memory Address");
392 	case 0x7:
393 		return ("Generic");
394 	}
395 	return ("???");
396 }
397 
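/* Return non-zero if this record should be suppressed rather than logged. */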
398 static int
399 mca_mute(const struct mca_record *rec)
400 {
401 
402 	/*
403 	 * Skip spurious corrected parity errors generated by Intel Haswell-
404 	 * and Broadwell-based CPUs (see the HSD131, HSM142, HSW131 and BDM48
405 	 * errata, respectively), unless logging of them is enabled.
406 	 * Note that these errors have also been observed on the D0-stepping
407 	 * of Haswell, while at least initially the CPU specification updates
408 	 * suggested that only the C0-stepping was affected.  Similarly, the
409 	 * Celeron 2955U with a CPU ID of 0x45 is apparently affected by the
410 	 * same problem, even though HSM142 only refers to 0x3c and 0x46.
411 	 */
412 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
413 	    CPUID_TO_FAMILY(cpu_id) == 0x6 &&
414 	    (CPUID_TO_MODEL(cpu_id) == 0x3c ||	/* HSD131, HSM142, HSW131 */
415 	    CPUID_TO_MODEL(cpu_id) == 0x3d ||	/* BDM48 */
416 	    CPUID_TO_MODEL(cpu_id) == 0x45 ||
417 	    CPUID_TO_MODEL(cpu_id) == 0x46) &&	/* HSM142 */
418 	    rec->mr_bank == 0 &&
419 	    (rec->mr_status & 0xa0000000ffffffff) == 0x80000000000f0005 &&
420 	    !intel6h_HSD131)
421 	    	return (1);
422 
423 	return (0);
424 }
425 
426 /* Dump details about a single machine check. */
427 static void
428 mca_log(const struct mca_record *rec)
429 {
430 	uint16_t mca_error;
431 
432 	if (mca_mute(rec))
433 		return;
434 
435 	if (!log_corrected && (rec->mr_status & MC_STATUS_UC) == 0 &&
436 	    (!tes_supported(rec->mr_mcg_cap) ||
437 	    ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) != 0x2))
438 		return;
439 
440 	printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank,
441 	    (long long)rec->mr_status);
442 	printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n",
443 	    (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status);
444 	printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor,
445 	    rec->mr_cpu_id, rec->mr_apic_id);
446 	printf("MCA: CPU %d ", rec->mr_cpu);
447 	if (rec->mr_status & MC_STATUS_UC)
448 		printf("UNCOR ");
449 	else {
450 		printf("COR ");
451 		if (cmci_supported(rec->mr_mcg_cap))
452 			printf("(%lld) ", ((long long)rec->mr_status &
453 			    MC_STATUS_COR_COUNT) >> 38);
454 		if (tes_supported(rec->mr_mcg_cap)) {
455 			switch ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) {
456 			case 0x1:
457 				printf("(Green) ");
				break;
458 			case 0x2:
459 				printf("(Yellow) ");
				break;
460 			}
461 		}
462 	}
463 	if (rec->mr_status & MC_STATUS_EN)
464 		printf("EN ");
465 	if (rec->mr_status & MC_STATUS_PCC)
466 		printf("PCC ");
467 	if (ser_supported(rec->mr_mcg_cap)) {
468 		if (rec->mr_status & MC_STATUS_S)
469 			printf("S ");
470 		if (rec->mr_status & MC_STATUS_AR)
471 			printf("AR ");
472 	}
473 	if (rec->mr_status & MC_STATUS_OVER)
474 		printf("OVER ");
475 	mca_error = rec->mr_status & MC_STATUS_MCA_ERROR;
476 	switch (mca_error) {
477 		/* Simple error codes. */
478 	case 0x0000:
479 		printf("no error");
480 		break;
481 	case 0x0001:
482 		printf("unclassified error");
483 		break;
484 	case 0x0002:
485 		printf("ucode ROM parity error");
486 		break;
487 	case 0x0003:
488 		printf("external error");
489 		break;
490 	case 0x0004:
491 		printf("FRC error");
492 		break;
493 	case 0x0005:
494 		printf("internal parity error");
495 		break;
496 	case 0x0006:
497 		printf("SMM handler code access violation");
498 		break;
499 	case 0x0400:
500 		printf("internal timer error");
501 		break;
502 	case 0x0e0b:
503 		printf("generic I/O error");
504 		if (rec->mr_cpu_vendor_id == CPU_VENDOR_INTEL &&
505 		    (rec->mr_status & MC_STATUS_MISCV)) {
506 			printf(" (pci%d:%d:%d:%d)",
507 			    (int)((rec->mr_misc & MC_MISC_PCIE_SEG) >> 32),
508 			    (int)((rec->mr_misc & MC_MISC_PCIE_BUS) >> 24),
509 			    (int)((rec->mr_misc & MC_MISC_PCIE_SLOT) >> 19),
510 			    (int)((rec->mr_misc & MC_MISC_PCIE_FUNC) >> 16));
511 		}
512 		break;
513 	default:
514 		if ((mca_error & 0xfc00) == 0x0400) {
515 			printf("internal error %x", mca_error & 0x03ff);
516 			break;
517 		}
518 
519 		/* Compound error codes. */
520 
521 		/* Memory hierarchy error. */
522 		if ((mca_error & 0xeffc) == 0x000c) {
523 			printf("%s memory error", mca_error_level(mca_error));
524 			break;
525 		}
526 
527 		/* TLB error. */
528 		if ((mca_error & 0xeff0) == 0x0010) {
529 			printf("%sTLB %s error", mca_error_ttype(mca_error),
530 			    mca_error_level(mca_error));
531 			break;
532 		}
533 
534 		/* Memory controller error. */
535 		if ((mca_error & 0xef80) == 0x0080) {
536 			printf("%s channel ", mca_error_mmtype(mca_error));
537 			if ((mca_error & 0x000f) != 0x000f)
538 				printf("%d", mca_error & 0x000f);
539 			else
540 				printf("??");
541 			printf(" memory error");
542 			break;
543 		}
544 
545 		/* Cache error. */
546 		if ((mca_error & 0xef00) == 0x0100) {
547 			printf("%sCACHE %s %s error",
548 			    mca_error_ttype(mca_error),
549 			    mca_error_level(mca_error),
550 			    mca_error_request(mca_error));
551 			break;
552 		}
553 
554 		/* Extended memory error. */
555 		if ((mca_error & 0xef80) == 0x0280) {
556 			printf("%s channel ", mca_error_mmtype(mca_error));
557 			if ((mca_error & 0x000f) != 0x000f)
558 				printf("%d", mca_error & 0x000f);
559 			else
560 				printf("??");
561 			printf(" extended memory error");
562 			break;
563 		}
564 
565 		/* Bus and/or Interconnect error. */
566 		if ((mca_error & 0xe800) == 0x0800) {
567 			printf("BUS%s ", mca_error_level(mca_error));
568 			switch ((mca_error & 0x0600) >> 9) {
569 			case 0:
570 				printf("Source");
571 				break;
572 			case 1:
573 				printf("Responder");
574 				break;
575 			case 2:
576 				printf("Observer");
577 				break;
578 			default:
579 				printf("???");
580 				break;
581 			}
582 			printf(" %s ", mca_error_request(mca_error));
583 			switch ((mca_error & 0x000c) >> 2) {
584 			case 0:
585 				printf("Memory");
586 				break;
587 			case 2:
588 				printf("I/O");
589 				break;
590 			case 3:
591 				printf("Other");
592 				break;
593 			default:
594 				printf("???");
595 				break;
596 			}
597 			if (mca_error & 0x0100)
598 				printf(" timed out");
599 			break;
600 		}
601 
602 		printf("unknown error %x", mca_error);
603 		break;
604 	}
605 	printf("\n");
606 	if (rec->mr_status & MC_STATUS_ADDRV) {
607 		printf("MCA: Address 0x%llx", (long long)rec->mr_addr);
608 		if (ser_supported(rec->mr_mcg_cap) &&
609 		    (rec->mr_status & MC_STATUS_MISCV)) {
610 			printf(" (Mode: %s, LSB: %d)",
611 			    mca_address_mode(rec->mr_misc),
612 			    (int)(rec->mr_misc & MC_MISC_RA_LSB));
613 		}
614 		printf("\n");
615 	}
616 	if (rec->mr_status & MC_STATUS_MISCV)
617 		printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc);
618 }
619 
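/*
 * Determine whether a bank status value describes a machine check
 * exception (as opposed to a corrected or silent error), clearing
 * *recoverablep if the exception is not software-recoverable.
 */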
620 static bool
621 mca_is_mce(uint64_t mcg_cap, uint64_t status, bool *recoverablep)
622 {
623 
624 	/* Corrected error. */
625 	if ((status & MC_STATUS_UC) == 0)
626 		return (0);
627 
628 	/* Spurious MCA error. */
629 	if ((status & MC_STATUS_EN) == 0)
630 		return (0);
631 
632 	/* The processor does not support software error recovery. */
633 	if (!ser_supported(mcg_cap)) {
634 		*recoverablep = false;
635 		return (1);
636 	}
637 
638 	/* Context might have been corrupted. */
639 	if (status & MC_STATUS_PCC) {
640 		*recoverablep = false;
641 		return (1);
642 	}
643 
644 	/* Uncorrected software recoverable. */
645 	if (status & MC_STATUS_S) {
646 		/* Action required vs optional. */
647 		if (status & MC_STATUS_AR)
648 			*recoverablep = false;
649 		return (1);
650 	}
651 
652 	/* Uncorrected no action required. */
653 	return (0);
654 }
655 
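/*
 * Read a single MCA bank and capture its contents into *rec if it holds a
 * valid error whose class matches the scan mode.  The bank is then cleared
 * unless the error is an unrecoverable machine check, which is left in
 * place for the BIOS to inspect.  Returns 1 if a record was captured.
 */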
656 static int
657 mca_check_status(enum scan_mode mode, uint64_t mcg_cap, int bank,
658     struct mca_record *rec, bool *recoverablep)
659 {
660 	uint64_t status;
661 	u_int p[4];
662 	bool mce, recover;
663 
664 	status = rdmsr(mca_msr_ops.status(bank));
665 	if (!(status & MC_STATUS_VAL))
666 		return (0);
667 
668 	recover = *recoverablep;
669 	mce = mca_is_mce(mcg_cap, status, &recover);
670 	if (mce != (mode == MCE))
671 		return (0);
672 	*recoverablep = recover;
673 
674 	/* Save exception information. */
675 	rec->mr_status = status;
676 	rec->mr_bank = bank;
677 	rec->mr_addr = 0;
678 	if (status & MC_STATUS_ADDRV)
679 		rec->mr_addr = rdmsr(mca_msr_ops.addr(bank));
680 	rec->mr_misc = 0;
681 	if (status & MC_STATUS_MISCV)
682 		rec->mr_misc = rdmsr(mca_msr_ops.misc(bank));
683 	rec->mr_tsc = rdtsc();
684 	rec->mr_apic_id = PCPU_GET(apic_id);
685 	rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP);
686 	rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS);
687 	rec->mr_cpu_id = cpu_id;
688 	rec->mr_cpu_vendor_id = cpu_vendor_id;
689 	rec->mr_cpu = PCPU_GET(cpuid);
690 
691 	/*
692 	 * Clear machine check.  Don't do this for uncorrectable
693 	 * errors so that the BIOS can see them.
694 	 */
695 	if (!mce || recover) {
696 		wrmsr(mca_msr_ops.status(bank), 0);
697 		do_cpuid(0, p);
698 	}
699 	return (1);
700 }
701 
702 static void
703 mca_resize_freelist(void)
704 {
705 	struct mca_internal *next, *rec;
706 	STAILQ_HEAD(, mca_internal) tmplist;
707 	int count, i, desired_max, desired_min;
708 
709 	/*
710 	 * Ensure we have at least one record for each bank and one
711 	 * record per CPU, but no more than twice that amount.
712 	 */
713 	desired_min = imax(mp_ncpus, mca_banks);
714 	desired_max = imax(mp_ncpus, mca_banks) * 2;
715 	STAILQ_INIT(&tmplist);
716 	mtx_lock_spin(&mca_lock);
717 	while (mca_freecount > desired_max) {
718 		rec = STAILQ_FIRST(&mca_freelist);
719 		KASSERT(rec != NULL, ("mca_freecount is %d, but list is empty",
720 		    mca_freecount));
721 		STAILQ_REMOVE_HEAD(&mca_freelist, link);
722 		mca_freecount--;
723 		STAILQ_INSERT_TAIL(&tmplist, rec, link);
724 	}
725 	while (mca_freecount < desired_min) {
726 		count = desired_min - mca_freecount;
727 		mtx_unlock_spin(&mca_lock);
728 		for (i = 0; i < count; i++) {
729 			rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
730 			STAILQ_INSERT_TAIL(&tmplist, rec, link);
731 		}
732 		mtx_lock_spin(&mca_lock);
733 		STAILQ_CONCAT(&mca_freelist, &tmplist);
734 		mca_freecount += count;
735 	}
736 	mtx_unlock_spin(&mca_lock);
737 	STAILQ_FOREACH_SAFE(rec, &tmplist, link, next)
738 		free(rec, M_MCA);
739 }
740 
741 static void
742 mca_resize(void *context, int pending)
743 {
744 
745 	mca_resize_freelist();
746 }
747 
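/*
 * Queue a freshly gathered record on mca_pending.  When polling we may
 * sleep, so the entry is simply malloc'd; in exception or CMCI context a
 * preallocated entry is taken from mca_freelist instead.
 */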
748 static void
749 mca_record_entry(enum scan_mode mode, const struct mca_record *record)
750 {
751 	struct mca_internal *rec;
752 
753 	if (mode == POLLED) {
754 		rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
755 		mtx_lock_spin(&mca_lock);
756 	} else {
757 		mtx_lock_spin(&mca_lock);
758 		rec = STAILQ_FIRST(&mca_freelist);
759 		if (rec == NULL) {
760 			printf("MCA: Unable to allocate space for an event.\n");
761 			mca_log(record);
762 			mtx_unlock_spin(&mca_lock);
763 			return;
764 		}
765 		STAILQ_REMOVE_HEAD(&mca_freelist, link);
766 		mca_freecount--;
767 	}
768 
769 	rec->rec = *record;
770 	STAILQ_INSERT_TAIL(&mca_pending, rec, link);
771 	mtx_unlock_spin(&mca_lock);
772 }
773 
774 #ifdef DEV_APIC
775 /*
776  * Update the interrupt threshold for a CMCI.  The strategy is to use
777  * a low trigger that interrupts as soon as the first event occurs.
778  * However, if a steady stream of events arrives, the threshold is
779  * increased until the interrupts are throttled to at most once every
780  * cmc_throttle seconds or once per periodic scan.  If a periodic scan
781  * finds that the threshold is too high, it is lowered.
782  */
783 static int
784 update_threshold(enum scan_mode mode, int valid, int last_intr, int count,
785     int cur_threshold, int max_threshold)
786 {
787 	u_int delta;
788 	int limit;
789 
790 	delta = (u_int)(time_uptime - last_intr);
791 	limit = cur_threshold;
792 
793 	/*
794 	 * If an interrupt was received less than cmc_throttle seconds
795 	 * since the previous interrupt and the count from the current
796 	 * event is greater than or equal to the current threshold,
797 	 * double the threshold up to the max.
798 	 */
799 	if (mode == CMCI && valid) {
800 		if (delta < cmc_throttle && count >= limit &&
801 		    limit < max_threshold) {
802 			limit = min(limit << 1, max_threshold);
803 		}
804 		return (limit);
805 	}
806 
807 	/*
808 	 * When the banks are polled, check to see if the threshold
809 	 * should be lowered.
810 	 */
811 	if (mode != POLLED)
812 		return (limit);
813 
814 	/* If a CMCI occurred recently, do nothing for now. */
815 	if (delta < cmc_throttle)
816 		return (limit);
817 
818 	/*
819 	 * Compute a new limit based on the average rate of events per
820 	 * cmc_throttle seconds since the last interrupt.
821 	 */
822 	if (valid) {
823 		limit = count * cmc_throttle / delta;
824 		if (limit <= 0)
825 			limit = 1;
826 		else if (limit > max_threshold)
827 			limit = max_threshold;
828 	} else {
829 		limit = 1;
830 	}
831 	return (limit);
832 }
833 
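/*
 * Reprogram the CMCI threshold in MC_CTL2 for one bank using the policy
 * implemented by update_threshold().
 */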
834 static void
835 cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
836 {
837 	struct cmc_state *cc;
838 	uint64_t ctl;
839 	int cur_threshold, new_threshold;
840 	int count;
841 
842 	/* Fetch the current limit for this bank. */
843 	cc = &cmc_state[PCPU_GET(cpuid)][bank];
844 	ctl = rdmsr(MSR_MC_CTL2(bank));
845 	count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
846 	cur_threshold = ctl & MC_CTL2_THRESHOLD;
847 
848 	new_threshold = update_threshold(mode, valid, cc->last_intr, count,
849 	    cur_threshold, cc->max_threshold);
850 
851 	if (mode == CMCI && valid)
852 		cc->last_intr = time_uptime;
853 	if (new_threshold != cur_threshold) {
854 		ctl &= ~MC_CTL2_THRESHOLD;
855 		ctl |= new_threshold;
856 		wrmsr(MSR_MC_CTL2(bank), ctl);
857 	}
858 }
859 
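/*
 * AMD counterpart of cmci_update(): recompute the threshold, rewrite the
 * counter preset in MCi_MISC and clear the overflow bit.
 */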
860 static void
861 amd_thresholding_update(enum scan_mode mode, int bank, int valid)
862 {
863 	struct amd_et_state *cc;
864 	uint64_t misc;
865 	int new_threshold;
866 	int count;
867 
868 	cc = &amd_et_state[PCPU_GET(cpuid)][bank];
869 	misc = rdmsr(mca_msr_ops.misc(bank));
870 	count = (misc & MC_MISC_AMD_CNT_MASK) >> MC_MISC_AMD_CNT_SHIFT;
871 	count = count - (MC_MISC_AMD_CNT_MAX - cc->cur_threshold);
872 
873 	new_threshold = update_threshold(mode, valid, cc->last_intr, count,
874 	    cc->cur_threshold, MC_MISC_AMD_CNT_MAX);
875 
876 	cc->cur_threshold = new_threshold;
877 	misc &= ~MC_MISC_AMD_CNT_MASK;
878 	misc |= (uint64_t)(MC_MISC_AMD_CNT_MAX - cc->cur_threshold)
879 	    << MC_MISC_AMD_CNT_SHIFT;
880 	misc &= ~MC_MISC_AMD_OVERFLOW;
881 	wrmsr(mca_msr_ops.misc(bank), misc);
882 	if (mode == CMCI && valid)
883 		cc->last_intr = time_uptime;
884 }
885 #endif
886 
887 /*
888  * This scans all the machine check banks of the current CPU to see if
889  * there are any machine checks.  Any non-recoverable errors are
890  * reported immediately via mca_log().  The current thread must be
891  * pinned when this is called.  The 'mode' parameter indicates if we
892  * are being called from the MC exception handler, the CMCI handler,
893  * or the periodic poller.
894  */
895 static int
896 mca_scan(enum scan_mode mode, bool *recoverablep)
897 {
898 	struct mca_record rec;
899 	uint64_t mcg_cap;
900 	int count = 0, i, valid;
901 
902 	mcg_cap = rdmsr(MSR_MCG_CAP);
903 	for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
904 #ifdef DEV_APIC
905 		/*
906 		 * For a CMCI, only check banks this CPU is
907 		 * responsible for.
908 		 */
909 		if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
910 			continue;
911 #endif
912 
913 		valid = mca_check_status(mode, mcg_cap, i, &rec, recoverablep);
914 		if (valid) {
915 			count++;
916 			if (*recoverablep)
917 				mca_record_entry(mode, &rec);
918 			else
919 				mca_log(&rec);
920 		}
921 
922 #ifdef DEV_APIC
923 		/*
924 		 * If this is a bank this CPU monitors via CMCI,
925 		 * update the threshold.
926 		 */
927 		if (PCPU_GET(cmci_mask) & 1 << i) {
928 			if (cmc_state != NULL)
929 				cmci_update(mode, i, valid, &rec);
930 			else
931 				amd_thresholding_update(mode, i, valid);
932 		}
933 #endif
934 	}
935 	return (count);
936 }
937 
938 /*
939  * Store a new record on the mca_records list while enforcing
940  * mca_maxcount.
941  */
942 static void
943 mca_store_record(struct mca_internal *mca)
944 {
945 
946 	/*
947 	 * If we are storing no records (mca_maxcount == 0),
948 	 * we just free this record.
949 	 *
950 	 * If we are storing records (mca_maxcount != 0) and
951 	 * we have free space on the list, store the record
952 	 * and increment mca_count.
953 	 *
954 	 * If we are storing records and we do not have free
955 	 * space on the list, store the new record at the
956 	 * tail and free the oldest one from the head.
957 	 */
958 	if (mca_maxcount != 0)
959 		STAILQ_INSERT_TAIL(&mca_records, mca, link);
960 	if (mca_maxcount < 0 || mca_count < mca_maxcount)
961 		mca_count++;
962 	else {
963 		if (mca_maxcount != 0) {
964 			mca = STAILQ_FIRST(&mca_records);
965 			STAILQ_REMOVE_HEAD(&mca_records, link);
966 		}
967 		STAILQ_INSERT_TAIL(&mca_freelist, mca, link);
968 		mca_freecount++;
969 	}
970 }
971 
972 /*
973  * Do the work to process machine check records which have just been
974  * gathered. Print any pending logs to the console. Queue them for storage.
975  * Trigger a resizing of the free list.
976  */
977 static void
978 mca_process_records(enum scan_mode mode)
979 {
980 	struct mca_internal *mca;
981 
982 	mtx_lock_spin(&mca_lock);
983 	while ((mca = STAILQ_FIRST(&mca_pending)) != NULL) {
984 		STAILQ_REMOVE_HEAD(&mca_pending, link);
985 		mca_log(&mca->rec);
986 		mca_store_record(mca);
987 	}
988 	mtx_unlock_spin(&mca_lock);
989 	if (mode == POLLED)
990 		mca_resize_freelist();
991 	else if (!cold)
992 		taskqueue_enqueue(mca_tq, &mca_resize_task);
993 }
994 
995 /*
996  * Scan the machine check banks on all CPUs by binding to each CPU in
997  * turn.  If any of the CPUs contained new machine check records, log
998  * them to the console.
999  */
1000 static void
1001 mca_scan_cpus(void *context, int pending)
1002 {
1003 	struct thread *td;
1004 	int cpu;
1005 	bool recoverable = true;
1006 
1007 	mca_resize_freelist();
1008 	td = curthread;
1009 	thread_lock(td);
1010 	CPU_FOREACH(cpu) {
1011 		sched_bind(td, cpu);
1012 		thread_unlock(td);
1013 		mca_scan(POLLED, &recoverable);
1014 		thread_lock(td);
1015 		sched_unbind(td);
1016 	}
1017 	thread_unlock(td);
1018 	if (!STAILQ_EMPTY(&mca_pending))
1019 		mca_process_records(POLLED);
1020 	taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task,
1021 	    mca_ticks * SBT_1S, 0, C_PREL(1));
1022 }
1023 
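/*
 * Sysctl handler for hw.mca.force_scan: writing a non-zero value schedules
 * an immediate scan of all CPUs.
 */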
1024 static int
1025 sysctl_mca_scan(SYSCTL_HANDLER_ARGS)
1026 {
1027 	int error, i;
1028 
1029 	i = 0;
1030 	error = sysctl_handle_int(oidp, &i, 0, req);
1031 	if (error)
1032 		return (error);
1033 	if (i)
1034 		taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task,
1035 		    0, 0, 0);
1036 	return (0);
1037 }
1038 
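/*
 * Sysctl handler for hw.mca.maxcount.  When the limit is reduced, excess
 * records are moved from mca_records back onto the free list.
 */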
1039 static int
1040 sysctl_mca_maxcount(SYSCTL_HANDLER_ARGS)
1041 {
1042 	struct mca_internal *mca;
1043 	int error, i;
1044 	bool doresize;
1045 
1046 	i = mca_maxcount;
1047 	error = sysctl_handle_int(oidp, &i, 0, req);
1048 	if (error || req->newptr == NULL)
1049 		return (error);
1050 	mtx_lock_spin(&mca_lock);
1051 	mca_maxcount = i;
1052 	doresize = false;
1053 	if (mca_maxcount >= 0)
1054 		while (mca_count > mca_maxcount) {
1055 			mca = STAILQ_FIRST(&mca_records);
1056 			STAILQ_REMOVE_HEAD(&mca_records, link);
1057 			mca_count--;
1058 			STAILQ_INSERT_TAIL(&mca_freelist, mca, link);
1059 			mca_freecount++;
1060 			doresize = true;
1061 		}
1062 	mtx_unlock_spin(&mca_lock);
1063 	if (doresize && !cold)
1064 		taskqueue_enqueue(mca_tq, &mca_resize_task);
1065 	return (error);
1066 }
1067 
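/* Start the taskqueue thread and schedule the first periodic scan. */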
1068 static void
1069 mca_startup(void *dummy)
1070 {
1071 
1072 	if (mca_banks <= 0)
1073 		return;
1074 
1075 	/* CMCIs during boot may have claimed items from the freelist. */
1076 	mca_resize_freelist();
1077 
1078 	taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq");
1079 	taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task,
1080 	    mca_ticks * SBT_1S, 0, C_PREL(1));
1081 }
1082 #ifdef EARLY_AP_STARTUP
1083 SYSINIT(mca_startup, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, mca_startup, NULL);
1084 #else
1085 SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
1086 #endif
1087 
1088 #ifdef DEV_APIC
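/* Allocate per-CPU CMCI state and export the cmc_throttle sysctl. */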
1089 static void
1090 cmci_setup(void)
1091 {
1092 	int i;
1093 
1094 	cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *), M_MCA,
1095 	    M_WAITOK);
1096 	for (i = 0; i <= mp_maxid; i++)
1097 		cmc_state[i] = malloc(sizeof(struct cmc_state) * mca_banks,
1098 		    M_MCA, M_WAITOK | M_ZERO);
1099 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1100 	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
1101 	    &cmc_throttle, 0, sysctl_positive_int, "I",
1102 	    "Interval in seconds to throttle corrected MC interrupts");
1103 }
1104 
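/*
 * Allocate per-CPU AMD error-thresholding state; the throttle interval is
 * shared with CMCI via the same cmc_throttle sysctl.
 */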
1105 static void
1106 amd_thresholding_setup(void)
1107 {
1108 	u_int i;
1109 
1110 	amd_et_state = malloc((mp_maxid + 1) * sizeof(struct amd_et_state *),
1111 	    M_MCA, M_WAITOK);
1112 	for (i = 0; i <= mp_maxid; i++)
1113 		amd_et_state[i] = malloc(sizeof(struct amd_et_state) *
1114 		    mca_banks, M_MCA, M_WAITOK | M_ZERO);
1115 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1116 	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
1117 	    &cmc_throttle, 0, sysctl_positive_int, "I",
1118 	    "Interval in seconds to throttle corrected MC interrupts");
1119 }
1120 #endif
1121 
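/*
 * One-time setup performed on the BSP: size the free list, create the
 * taskqueue and sysctl nodes, and allocate CMCI or AMD thresholding state
 * if either is supported.
 */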
1122 static void
1123 mca_setup(uint64_t mcg_cap)
1124 {
1125 
1126 	/*
1127 	 * On AMD Family 10h processors, unless logging of level one TLB
1128 	 * parity (L1TP) errors is disabled, enable the recommended workaround
1129 	 * for Erratum 383.
1130 	 */
1131 	if (cpu_vendor_id == CPU_VENDOR_AMD &&
1132 	    CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP)
1133 		workaround_erratum383 = 1;
1134 
1135 	mca_banks = mcg_cap & MCG_CAP_COUNT;
1136 	mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
1137 	STAILQ_INIT(&mca_records);
1138 	STAILQ_INIT(&mca_pending);
1139 	mca_tq = taskqueue_create_fast("mca", M_WAITOK,
1140 	    taskqueue_thread_enqueue, &mca_tq);
1141 	TIMEOUT_TASK_INIT(mca_tq, &mca_scan_task, 0, mca_scan_cpus, NULL);
1142 	STAILQ_INIT(&mca_freelist);
1143 	TASK_INIT(&mca_resize_task, 0, mca_resize, NULL);
1144 	mca_resize_freelist();
1145 	SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1146 	    "count", CTLFLAG_RD, (int *)(uintptr_t)&mca_count, 0,
1147 	    "Record count");
1148 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1149 	    "maxcount", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
1150 	    &mca_maxcount, 0, sysctl_mca_maxcount, "I",
1151 	    "Maximum record count (-1 is unlimited)");
1152 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1153 	    "interval", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
1154 	    &mca_ticks, 0, sysctl_positive_int, "I",
1155 	    "Periodic interval in seconds to scan for machine checks");
1156 	SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1157 	    "records", CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mca_records,
1158 	    "Machine check records");
1159 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1160 	    "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1161 	    sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
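	/*
	 * The knobs registered above are visible to the administrator as
	 * hw.mca.count, hw.mca.maxcount, hw.mca.interval, hw.mca.records.<n>
	 * and hw.mca.force_scan, e.g. "sysctl hw.mca.force_scan=1".
	 */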
1162 #ifdef DEV_APIC
1163 	if (cmci_supported(mcg_cap))
1164 		cmci_setup();
1165 	else if (amd_thresholding_supported())
1166 		amd_thresholding_setup();
1167 #endif
1168 }
1169 
1170 #ifdef DEV_APIC
1171 /*
1172  * See if we should monitor CMCI for this bank.  If CMCI_EN is already
1173  * set in MC_CTL2, then another CPU is responsible for this bank, so
1174  * ignore it.  If CMCI_EN reads back as zero after being set, then this
1175  * bank does not support CMCI.  If this CPU sets CMCI_EN, then it should
1176  * now monitor this bank.
1177  */
1178 static void
1179 cmci_monitor(int i)
1180 {
1181 	struct cmc_state *cc;
1182 	uint64_t ctl;
1183 
1184 	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
1185 
1186 	/*
1187 	 * It is possible for some APs to report CMCI support even if the BSP
1188 	 * does not, apparently due to a BIOS bug.
1189 	 */
1190 	if (cmc_state == NULL) {
1191 		if (bootverbose) {
1192 			printf(
1193 		    "AP %d (%d,%d) reports CMCI support but the BSP does not\n",
1194 			    PCPU_GET(cpuid), PCPU_GET(apic_id),
1195 			    PCPU_GET(acpi_id));
1196 		}
1197 		return;
1198 	}
1199 
1200 	ctl = rdmsr(MSR_MC_CTL2(i));
1201 	if (ctl & MC_CTL2_CMCI_EN)
1202 		/* Already monitored by another CPU. */
1203 		return;
1204 
1205 	/* Set the threshold to one event for now. */
1206 	ctl &= ~MC_CTL2_THRESHOLD;
1207 	ctl |= MC_CTL2_CMCI_EN | 1;
1208 	wrmsr(MSR_MC_CTL2(i), ctl);
1209 	ctl = rdmsr(MSR_MC_CTL2(i));
1210 	if (!(ctl & MC_CTL2_CMCI_EN))
1211 		/* This bank does not support CMCI. */
1212 		return;
1213 
1214 	cc = &cmc_state[PCPU_GET(cpuid)][i];
1215 
1216 	/* Determine maximum threshold. */
1217 	ctl &= ~MC_CTL2_THRESHOLD;
1218 	ctl |= 0x7fff;
1219 	wrmsr(MSR_MC_CTL2(i), ctl);
1220 	ctl = rdmsr(MSR_MC_CTL2(i));
1221 	cc->max_threshold = ctl & MC_CTL2_THRESHOLD;
1222 
1223 	/* Start off with a threshold of 1. */
1224 	ctl &= ~MC_CTL2_THRESHOLD;
1225 	ctl |= 1;
1226 	wrmsr(MSR_MC_CTL2(i), ctl);
1227 
1228 	/* Mark this bank as monitored. */
1229 	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
1230 }
1231 
1232 /*
1233  * For resume, reset the threshold for any banks we monitor back to
1234  * one and throw away the timestamp of the last interrupt.
1235  */
1236 static void
1237 cmci_resume(int i)
1238 {
1239 	struct cmc_state *cc;
1240 	uint64_t ctl;
1241 
1242 	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
1243 
1244 	/* See cmci_monitor(). */
1245 	if (cmc_state == NULL)
1246 		return;
1247 
1248 	/* Ignore banks not monitored by this CPU. */
1249 	if (!(PCPU_GET(cmci_mask) & 1 << i))
1250 		return;
1251 
1252 	cc = &cmc_state[PCPU_GET(cpuid)][i];
1253 	cc->last_intr = 0;
1254 	ctl = rdmsr(MSR_MC_CTL2(i));
1255 	ctl &= ~MC_CTL2_THRESHOLD;
1256 	ctl |= MC_CTL2_CMCI_EN | 1;
1257 	wrmsr(MSR_MC_CTL2(i), ctl);
1258 }
1259 
1260 /*
1261  * Apply an AMD ET configuration to the corresponding MSR.
1262  */
1263 static void
1264 amd_thresholding_start(struct amd_et_state *cc, int bank)
1265 {
1266 	uint64_t misc;
1267 
1268 	KASSERT(amd_elvt >= 0, ("ELVT offset is not set"));
1269 
1270 	misc = rdmsr(mca_msr_ops.misc(bank));
1271 
1272 	misc &= ~MC_MISC_AMD_INT_MASK;
1273 	misc |= MC_MISC_AMD_INT_LVT;
1274 
1275 	misc &= ~MC_MISC_AMD_LVT_MASK;
1276 	misc |= (uint64_t)amd_elvt << MC_MISC_AMD_LVT_SHIFT;
1277 
1278 	misc &= ~MC_MISC_AMD_CNT_MASK;
1279 	misc |= (uint64_t)(MC_MISC_AMD_CNT_MAX - cc->cur_threshold)
1280 	    << MC_MISC_AMD_CNT_SHIFT;
1281 
1282 	misc &= ~MC_MISC_AMD_OVERFLOW;
1283 	misc |= MC_MISC_AMD_CNTEN;
1284 
1285 	wrmsr(mca_msr_ops.misc(bank), misc);
1286 }
1287 
1288 static void
1289 amd_thresholding_monitor(int i)
1290 {
1291 	struct amd_et_state *cc;
1292 	uint64_t misc;
1293 
1294 	/*
1295 	 * Kludge: On 10h, banks after 4 do not support thresholding; they may
1296 	 * also have bogus Valid bits, so skip them.  This is definitely fixed
1297 	 * in 15h; I have not investigated whether earlier models fix it.
1298 	 */
1299 	if (CPUID_TO_FAMILY(cpu_id) < 0x15 && i >= 5)
1300 		return;
1301 
1302 	/* The counter must be valid and present. */
1303 	misc = rdmsr(mca_msr_ops.misc(i));
1304 	if ((misc & (MC_MISC_AMD_VAL | MC_MISC_AMD_CNTP)) !=
1305 	    (MC_MISC_AMD_VAL | MC_MISC_AMD_CNTP))
1306 		return;
1307 
1308 	/* The register should not be locked. */
1309 	if ((misc & MC_MISC_AMD_LOCK) != 0) {
1310 		if (bootverbose)
1311 			printf("%s: 0x%jx: Bank %d: locked\n", __func__,
1312 			    (uintmax_t)misc, i);
1313 		return;
1314 	}
1315 
1316 	/*
1317 	 * If counter is enabled then either the firmware or another CPU
1318 	 * has already claimed it.
1319 	 */
1320 	if ((misc & MC_MISC_AMD_CNTEN) != 0) {
1321 		if (bootverbose)
1322 			printf("%s: 0x%jx: Bank %d: already enabled\n",
1323 			    __func__, (uintmax_t)misc, i);
1324 		return;
1325 	}
1326 
1327 	/*
1328 	 * Configure an Extended Interrupt LVT register for reporting
1329 	 * counter overflows if that feature is supported and the first
1330 	 * extended register is available.
1331 	 */
1332 	amd_elvt = lapic_enable_mca_elvt();
1333 	if (amd_elvt < 0) {
1334 		printf("%s: Bank %d: lapic enable mca elvt failed: %d\n",
1335 		    __func__, i, amd_elvt);
1336 		return;
1337 	}
1338 
1339 	cc = &amd_et_state[PCPU_GET(cpuid)][i];
1340 	cc->cur_threshold = 1;
1341 	amd_thresholding_start(cc, i);
1342 
1343 	/* Mark this bank as monitored. */
1344 	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
1345 }
1346 
1347 static void
1348 amd_thresholding_resume(int i)
1349 {
1350 	struct amd_et_state *cc;
1351 
1352 	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
1353 
1354 	/* Ignore banks not monitored by this CPU. */
1355 	if (!(PCPU_GET(cmci_mask) & 1 << i))
1356 		return;
1357 
1358 	cc = &amd_et_state[PCPU_GET(cpuid)][i];
1359 	cc->last_intr = 0;
1360 	cc->cur_threshold = 1;
1361 	amd_thresholding_start(cc, i);
1362 }
1363 #endif
1364 
1365 /*
1366  * Initializes per-CPU machine check registers and enables corrected
1367  * machine check interrupts.
1368  */
1369 static void
1370 _mca_init(int boot)
1371 {
1372 	uint64_t mcg_cap;
1373 	uint64_t ctl, mask;
1374 	int i, skip, family;
1375 
1376 	family = CPUID_TO_FAMILY(cpu_id);
1377 
1378 	/* MCE is required. */
1379 	if (!mca_enabled || !(cpu_feature & CPUID_MCE))
1380 		return;
1381 
1382 	if (cpu_feature & CPUID_MCA) {
1383 		if (boot)
1384 			PCPU_SET(cmci_mask, 0);
1385 
1386 		mcg_cap = rdmsr(MSR_MCG_CAP);
1387 		if (mcg_cap & MCG_CAP_CTL_P)
1388 			/* Enable MCA features. */
1389 			wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
1390 		if (IS_BSP() && boot)
1391 			mca_setup(mcg_cap);
1392 
1393 		/*
1394 		 * Disable logging of level one TLB parity (L1TP) errors by
1395 		 * the data cache as an alternative workaround for AMD Family
1396 		 * 10h Erratum 383.  Unlike the recommended workaround, there
1397 		 * is no performance penalty to this workaround.  However,
1398 		 * L1TP errors will go unreported.
1399 		 */
1400 		if (cpu_vendor_id == CPU_VENDOR_AMD && family == 0x10 &&
1401 		    !amd10h_L1TP) {
1402 			mask = rdmsr(MSR_MC0_CTL_MASK);
1403 			if ((mask & (1UL << 5)) == 0)
1404 				wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5));
1405 		}
1406 		if (amd_rascap & AMDRAS_SCALABLE_MCA) {
1407 			mca_msr_ops.ctl = mca_smca_ctl_reg;
1408 			mca_msr_ops.status = mca_smca_status_reg;
1409 			mca_msr_ops.addr = mca_smca_addr_reg;
1410 			mca_msr_ops.misc = mca_smca_misc_reg;
1411 		}
1412 
1413 		/* Enable local MCE if supported. */
1414 		if (cpu_vendor_id == CPU_VENDOR_INTEL &&
1415 		    (mcg_cap & MCG_CAP_LMCE_P) &&
1416 		    (rdmsr(MSR_IA32_FEATURE_CONTROL) &
1417 		     IA32_FEATURE_CONTROL_LMCE_EN))
1418 			wrmsr(MSR_MCG_EXT_CTL, rdmsr(MSR_MCG_EXT_CTL) | 1);
1419 
1420 		/*
1421 		 * The cmci_monitor() must not be executed
1422 		 * simultaneously by several CPUs.
1423 		 */
1424 		if (boot)
1425 			mtx_lock_spin(&mca_lock);
1426 
1427 		for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
1428 			/* By default enable logging of all errors. */
1429 			ctl = 0xffffffffffffffffUL;
1430 			skip = 0;
1431 
1432 			if (cpu_vendor_id == CPU_VENDOR_INTEL) {
1433 				/*
1434 				 * For P6 models before Nehalem MC0_CTL is
1435 				 * always enabled and reserved.
1436 				 */
1437 				if (i == 0 && family == 0x6
1438 				    && CPUID_TO_MODEL(cpu_id) < 0x1a)
1439 					skip = 1;
1440 			} else if (cpu_vendor_id == CPU_VENDOR_AMD) {
1441 				/* BKDG for Family 10h: unset GartTblWkEn. */
1442 				if (i == MC_AMDNB_BANK && family >= 0xf &&
1443 				    family < 0x17)
1444 					ctl &= ~(1UL << 10);
1445 			}
1446 
1447 			if (!skip)
1448 				wrmsr(mca_msr_ops.ctl(i), ctl);
1449 
1450 #ifdef DEV_APIC
1451 			if (cmci_supported(mcg_cap)) {
1452 				if (boot)
1453 					cmci_monitor(i);
1454 				else
1455 					cmci_resume(i);
1456 			} else if (amd_thresholding_supported()) {
1457 				if (boot)
1458 					amd_thresholding_monitor(i);
1459 				else
1460 					amd_thresholding_resume(i);
1461 			}
1462 #endif
1463 
1464 			/* Clear all errors. */
1465 			wrmsr(mca_msr_ops.status(i), 0);
1466 		}
1467 		if (boot)
1468 			mtx_unlock_spin(&mca_lock);
1469 
1470 #ifdef DEV_APIC
1471 		if (cmci_supported(mcg_cap) &&
1472 		    PCPU_GET(cmci_mask) != 0 && boot)
1473 			lapic_enable_cmc();
1474 #endif
1475 	}
1476 
1477 	load_cr4(rcr4() | CR4_MCE);
1478 }
1479 
1480 /* Must be executed on each CPU during boot. */
1481 void
1482 mca_init(void)
1483 {
1484 
1485 	_mca_init(1);
1486 }
1487 
1488 /* Must be executed on each CPU during resume. */
1489 void
1490 mca_resume(void)
1491 {
1492 
1493 	_mca_init(0);
1494 }
1495 
1496 /*
1497  * The machine check registers for the BSP cannot be initialized until
1498  * the local APIC is initialized.  This happens at SI_SUB_CPU,
1499  * SI_ORDER_SECOND.
1500  */
1501 static void
1502 mca_init_bsp(void *arg __unused)
1503 {
1504 
1505 	mca_init();
1506 }
1507 SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL);
1508 
1509 /* Called when a machine check exception fires. */
1510 void
1511 mca_intr(void)
1512 {
1513 	uint64_t mcg_status;
1514 	int count;
1515 	bool lmcs, recoverable;
1516 
1517 	if (!(cpu_feature & CPUID_MCA)) {
1518 		/*
1519 		 * Just print the values of the old Pentium registers
1520 		 * and panic.
1521 		 */
1522 		printf("MC Type: 0x%jx  Address: 0x%jx\n",
1523 		    (uintmax_t)rdmsr(MSR_P5_MC_TYPE),
1524 		    (uintmax_t)rdmsr(MSR_P5_MC_ADDR));
1525 		panic("Machine check exception");
1526 	}
1527 
1528 	/* Scan the banks and check for any non-recoverable errors. */
1529 	mcg_status = rdmsr(MSR_MCG_STATUS);
1530 	recoverable = (mcg_status & MCG_STATUS_RIPV) != 0;
1531 	lmcs = (cpu_vendor_id != CPU_VENDOR_INTEL ||
1532 	    (mcg_status & MCG_STATUS_LMCS));
1533 	count = mca_scan(MCE, &recoverable);
1534 
1535 	if (!recoverable) {
1536 		/*
1537 		 * Only panic if the error was detected local to this CPU.
1538 		 * Some errors will assert a machine check on all CPUs, but
1539 		 * only certain CPUs will find a valid bank to log.
1540 		 */
1541 		while (!lmcs && count == 0)
1542 			cpu_spinwait();
1543 
1544 		panic("Unrecoverable machine check exception");
1545 	}
1546 
1547 	/* Clear MCIP. */
1548 	wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
1549 }
1550 
1551 #ifdef DEV_APIC
1552 /* Called for a CMCI (correctable machine check interrupt). */
1553 void
1554 cmc_intr(void)
1555 {
1556 	bool recoverable = true;
1557 
1558 	/*
1559 	 * Serialize MCA bank scanning to prevent collisions from
1560 	 * sibling threads.
1561 	 *
1562 	 * If we found anything, log them to the console.
1563 	 */
1564 	if (mca_scan(CMCI, &recoverable) != 0)
1565 		mca_process_records(CMCI);
1566 }
1567 #endif
1568