xref: /freebsd/sys/x86/x86/mca.c (revision d93a896e)
1 /*-
2  * Copyright (c) 2009 Hudson River Trading LLC
3  * Written by: John H. Baldwin <jhb@FreeBSD.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 /*
29  * Support for x86 machine check architecture.
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #ifdef __amd64__
36 #define	DEV_APIC
37 #else
38 #include "opt_apic.h"
39 #endif
40 
41 #include <sys/param.h>
42 #include <sys/bus.h>
43 #include <sys/interrupt.h>
44 #include <sys/kernel.h>
45 #include <sys/lock.h>
46 #include <sys/malloc.h>
47 #include <sys/mutex.h>
48 #include <sys/proc.h>
49 #include <sys/sched.h>
50 #include <sys/smp.h>
51 #include <sys/sysctl.h>
52 #include <sys/systm.h>
53 #include <sys/taskqueue.h>
54 #include <machine/intr_machdep.h>
55 #include <x86/apicvar.h>
56 #include <machine/cpu.h>
57 #include <machine/cputypes.h>
58 #include <x86/mca.h>
59 #include <machine/md_var.h>
60 #include <machine/specialreg.h>
61 
/*
 * Modes for mca_scan().  The mode tells the scanner which caller
 * triggered the scan.
 */
enum scan_mode {
	POLLED,		/* Task-context scan from mca_scan_cpus(). */
	MCE,		/* Machine check exception, from mca_intr(). */
	CMCI,		/* Corrected machine check interrupt, from cmc_intr(). */
};
68 
69 #ifdef DEV_APIC
/*
 * State maintained for each monitored MCx bank to control the
 * corrected machine check interrupt threshold.
 */
struct cmc_state {
	/* Largest threshold value MC_CTL2 retained when probed in cmci_monitor(). */
	int	max_threshold;
	/* time_uptime of the last CMCI that found a valid record in the bank. */
	time_t	last_intr;
};
78 
/*
 * Per-CPU state for the AMD error thresholding counter of the NB bank
 * (stored in the amd_et_state array, indexed by cpuid).
 */
struct amd_et_state {
	/* Number of events after which the counter should overflow. */
	int	cur_threshold;
	/* time_uptime of the last interrupt that found a valid record. */
	time_t	last_intr;
};
83 #endif
84 
/*
 * A machine check record together with its list linkage.  Entries live
 * either on mca_freelist (preallocated, unused) or on mca_records.
 */
struct mca_internal {
	struct mca_record rec;		/* Copy of the captured record. */
	int		logged;		/* Set once passed to mca_log(). */
	STAILQ_ENTRY(mca_internal) link;
};
90 
static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");

static volatile int mca_count;	/* Number of records stored. */
static int mca_banks;		/* Number of per-CPU register banks. */

static SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD, NULL,
    "Machine Check Architecture");

/* When zero, _mca_init() returns without touching any MC register. */
static int mca_enabled = 1;
SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
    "Administrative toggle for machine check support");

/*
 * Selects between the two Erratum 383 handling paths on AMD Family 10h:
 * non-zero enables workaround_erratum383 (see mca_setup()), zero masks
 * L1TP error logging instead (see _mca_init()).
 */
static int amd10h_L1TP = 1;
SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0,
    "Administrative toggle for logging of level one TLB parity (L1TP) errors");

/* When zero, mca_mute() suppresses the matching spurious Intel records. */
static int intel6h_HSD131;
SYSCTL_INT(_hw_mca, OID_AUTO, intel6h_HSD131, CTLFLAG_RDTUN, &intel6h_HSD131, 0,
    "Administrative toggle for logging of spurious corrected errors");

/* Not static: presumably consumed outside this file -- TODO confirm. */
int workaround_erratum383;
SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTUN,
    &workaround_erratum383, 0,
    "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");

/* Preallocated records for contexts that cannot call malloc(M_WAITOK). */
static STAILQ_HEAD(, mca_internal) mca_freelist;
static int mca_freecount;
/* Every record captured so far, in arrival order. */
static STAILQ_HEAD(, mca_internal) mca_records;
static struct callout mca_timer;
static int mca_ticks = 3600;	/* Check hourly by default. */
static struct taskqueue *mca_tq;
static struct task mca_refill_task, mca_scan_task;
/* Spin mutex protecting the two lists, their counters and mca_log() output. */
static struct mtx mca_lock;
124 
125 #ifdef DEV_APIC
static struct cmc_state **cmc_state;		/* Indexed by cpuid, bank. */
static struct amd_et_state *amd_et_state;	/* Indexed by cpuid. */
static int cmc_throttle = 60;	/* Time in seconds to throttle CMCI. */

/* Value returned by lapic_enable_mca_elvt(); stays -1 until that is called. */
static int amd_elvt = -1;
131 
132 static inline bool
133 amd_thresholding_supported(void)
134 {
135 	return (cpu_vendor_id == CPU_VENDOR_AMD &&
136 	    CPUID_TO_FAMILY(cpu_id) >= 0x10 && CPUID_TO_FAMILY(cpu_id) <= 0x16);
137 }
138 #endif
139 
140 static int
141 sysctl_positive_int(SYSCTL_HANDLER_ARGS)
142 {
143 	int error, value;
144 
145 	value = *(int *)arg1;
146 	error = sysctl_handle_int(oidp, &value, 0, req);
147 	if (error || req->newptr == NULL)
148 		return (error);
149 	if (value <= 0)
150 		return (EINVAL);
151 	*(int *)arg1 = value;
152 	return (0);
153 }
154 
155 static int
156 sysctl_mca_records(SYSCTL_HANDLER_ARGS)
157 {
158 	int *name = (int *)arg1;
159 	u_int namelen = arg2;
160 	struct mca_record record;
161 	struct mca_internal *rec;
162 	int i;
163 
164 	if (namelen != 1)
165 		return (EINVAL);
166 
167 	if (name[0] < 0 || name[0] >= mca_count)
168 		return (EINVAL);
169 
170 	mtx_lock_spin(&mca_lock);
171 	if (name[0] >= mca_count) {
172 		mtx_unlock_spin(&mca_lock);
173 		return (EINVAL);
174 	}
175 	i = 0;
176 	STAILQ_FOREACH(rec, &mca_records, link) {
177 		if (i == name[0]) {
178 			record = rec->rec;
179 			break;
180 		}
181 		i++;
182 	}
183 	mtx_unlock_spin(&mca_lock);
184 	return (SYSCTL_OUT(req, &record, sizeof(record)));
185 }
186 
/*
 * Decode bits 3:2 of an MCA error code into a transaction type tag:
 * "I", "D" or "G", with "?" for the remaining encoding.
 */
static const char *
mca_error_ttype(uint16_t mca_error)
{
	static const char *const tags[4] = { "I", "D", "G", "?" };

	return (tags[(mca_error >> 2) & 0x3]);
}
201 
/*
 * Decode bits 1:0 of an MCA error code into a memory hierarchy level
 * tag: "L0", "L1", "L2" or "LG".
 */
static const char *
mca_error_level(uint16_t mca_error)
{
	static const char *const levels[4] = { "L0", "L1", "L2", "LG" };

	return (levels[mca_error & 0x3]);
}
218 
/*
 * Decode bits 7:4 of an MCA error code into a request type tag.
 * Encodings 0 through 8 have names; anything above yields "???".
 */
static const char *
mca_error_request(uint16_t mca_error)
{
	static const char *const requests[] = {
		"ERR", "RD", "WR", "DRD", "DWR", "IRD", "PREFETCH", "EVICT",
		"SNOOP",
	};
	unsigned int code;

	code = (mca_error >> 4) & 0xf;
	if (code >= sizeof(requests) / sizeof(requests[0]))
		return ("???");
	return (requests[code]);
}
245 
/*
 * Decode bits 6:4 of an MCA error code into a memory transaction type
 * tag.  Encodings 0 through 4 have names; the rest yield "???".
 */
static const char *
mca_error_mmtype(uint16_t mca_error)
{
	static const char *const kinds[] = { "GEN", "RD", "WR", "AC", "MS" };
	unsigned int code;

	code = (mca_error >> 4) & 0x7;
	if (code >= sizeof(kinds) / sizeof(kinds[0]))
		return ("???");
	return (kinds[code]);
}
264 
265 static int
266 mca_mute(const struct mca_record *rec)
267 {
268 
269 	/*
270 	 * Skip spurious corrected parity errors generated by Intel Haswell-
271 	 * and Broadwell-based CPUs (see HSD131, HSM142, HSW131 and BDM48
272 	 * erratum respectively), unless reporting is enabled.
273 	 * Note that these errors also have been observed with the D0-stepping
274 	 * of Haswell, while at least initially the CPU specification updates
275 	 * suggested only the C0-stepping to be affected.  Similarly, Celeron
276 	 * 2955U with a CPU ID of 0x45 apparently are also concerned with the
277 	 * same problem, with HSM142 only referring to 0x3c and 0x46.
278 	 */
279 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
280 	    CPUID_TO_FAMILY(cpu_id) == 0x6 &&
281 	    (CPUID_TO_MODEL(cpu_id) == 0x3c ||	/* HSD131, HSM142, HSW131 */
282 	    CPUID_TO_MODEL(cpu_id) == 0x3d ||	/* BDM48 */
283 	    CPUID_TO_MODEL(cpu_id) == 0x45 ||
284 	    CPUID_TO_MODEL(cpu_id) == 0x46) &&	/* HSM142 */
285 	    rec->mr_bank == 0 &&
286 	    (rec->mr_status & 0xa0000000ffffffff) == 0x80000000000f0005 &&
287 	    !intel6h_HSD131)
288 	    	return (1);
289 
290 	return (0);
291 }
292 
/*
 * Dump details about a single machine check.
 *
 * Prints the raw bank/global registers, then a decoded one-line summary
 * (CPU, corrected/uncorrected, PCC/OVER flags and the error class), and
 * finally the address and misc registers when the status marks them as
 * valid.  Records selected by mca_mute() produce no output at all.
 * Callers in this file invoke it with mca_lock held.
 */
static void
mca_log(const struct mca_record *rec)
{
	uint16_t mca_error;

	if (mca_mute(rec))
	    	return;

	printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank,
	    (long long)rec->mr_status);
	printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n",
	    (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status);
	printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor,
	    rec->mr_cpu_id, rec->mr_apic_id);
	printf("MCA: CPU %d ", rec->mr_cpu);
	if (rec->mr_status & MC_STATUS_UC)
		printf("UNCOR ");
	else {
		printf("COR ");
		/* The corrected error count is only printed with CMCI support. */
		if (rec->mr_mcg_cap & MCG_CAP_CMCI_P)
			printf("(%lld) ", ((long long)rec->mr_status &
			    MC_STATUS_COR_COUNT) >> 38);
	}
	if (rec->mr_status & MC_STATUS_PCC)
		printf("PCC ");
	if (rec->mr_status & MC_STATUS_OVER)
		printf("OVER ");
	mca_error = rec->mr_status & MC_STATUS_MCA_ERROR;
	switch (mca_error) {
		/* Simple error codes. */
	case 0x0000:
		printf("no error");
		break;
	case 0x0001:
		printf("unclassified error");
		break;
	case 0x0002:
		printf("ucode ROM parity error");
		break;
	case 0x0003:
		printf("external error");
		break;
	case 0x0004:
		printf("FRC error");
		break;
	case 0x0005:
		printf("internal parity error");
		break;
	case 0x0400:
		printf("internal timer error");
		break;
	default:
		/* Remaining codes in the 0x0400 group are internal errors. */
		if ((mca_error & 0xfc00) == 0x0400) {
			printf("internal error %x", mca_error & 0x03ff);
			break;
		}

		/*
		 * Compound error codes.  The tests below run from the
		 * narrowest bit pattern to the widest, so their order
		 * matters.
		 */

		/* Memory hierarchy error. */
		if ((mca_error & 0xeffc) == 0x000c) {
			printf("%s memory error", mca_error_level(mca_error));
			break;
		}

		/* TLB error. */
		if ((mca_error & 0xeff0) == 0x0010) {
			printf("%sTLB %s error", mca_error_ttype(mca_error),
			    mca_error_level(mca_error));
			break;
		}

		/* Memory controller error. */
		if ((mca_error & 0xef80) == 0x0080) {
			printf("%s channel ", mca_error_mmtype(mca_error));
			/* A channel field of all ones is printed as unknown. */
			if ((mca_error & 0x000f) != 0x000f)
				printf("%d", mca_error & 0x000f);
			else
				printf("??");
			printf(" memory error");
			break;
		}

		/* Cache error. */
		if ((mca_error & 0xef00) == 0x0100) {
			printf("%sCACHE %s %s error",
			    mca_error_ttype(mca_error),
			    mca_error_level(mca_error),
			    mca_error_request(mca_error));
			break;
		}

		/* Bus and/or Interconnect error. */
		if ((mca_error & 0xe800) == 0x0800) {
			printf("BUS%s ", mca_error_level(mca_error));
			/* Bits 10:9: role of this processor in the transaction. */
			switch ((mca_error & 0x0600) >> 9) {
			case 0:
				printf("Source");
				break;
			case 1:
				printf("Responder");
				break;
			case 2:
				printf("Observer");
				break;
			default:
				printf("???");
				break;
			}
			printf(" %s ", mca_error_request(mca_error));
			/* Bits 3:2: kind of target that was accessed. */
			switch ((mca_error & 0x000c) >> 2) {
			case 0:
				printf("Memory");
				break;
			case 2:
				printf("I/O");
				break;
			case 3:
				printf("Other");
				break;
			default:
				printf("???");
				break;
			}
			if (mca_error & 0x0100)
				printf(" timed out");
			break;
		}

		printf("unknown error %x", mca_error);
		break;
	}
	printf("\n");
	if (rec->mr_status & MC_STATUS_ADDRV)
		printf("MCA: Address 0x%llx\n", (long long)rec->mr_addr);
	if (rec->mr_status & MC_STATUS_MISCV)
		printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc);
}
432 
433 static int
434 mca_check_status(int bank, struct mca_record *rec)
435 {
436 	uint64_t status;
437 	u_int p[4];
438 
439 	status = rdmsr(MSR_MC_STATUS(bank));
440 	if (!(status & MC_STATUS_VAL))
441 		return (0);
442 
443 	/* Save exception information. */
444 	rec->mr_status = status;
445 	rec->mr_bank = bank;
446 	rec->mr_addr = 0;
447 	if (status & MC_STATUS_ADDRV)
448 		rec->mr_addr = rdmsr(MSR_MC_ADDR(bank));
449 	rec->mr_misc = 0;
450 	if (status & MC_STATUS_MISCV)
451 		rec->mr_misc = rdmsr(MSR_MC_MISC(bank));
452 	rec->mr_tsc = rdtsc();
453 	rec->mr_apic_id = PCPU_GET(apic_id);
454 	rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP);
455 	rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS);
456 	rec->mr_cpu_id = cpu_id;
457 	rec->mr_cpu_vendor_id = cpu_vendor_id;
458 	rec->mr_cpu = PCPU_GET(cpuid);
459 
460 	/*
461 	 * Clear machine check.  Don't do this for uncorrectable
462 	 * errors so that the BIOS can see them.
463 	 */
464 	if (!(rec->mr_status & (MC_STATUS_PCC | MC_STATUS_UC))) {
465 		wrmsr(MSR_MC_STATUS(bank), 0);
466 		do_cpuid(0, p);
467 	}
468 	return (1);
469 }
470 
471 static void
472 mca_fill_freelist(void)
473 {
474 	struct mca_internal *rec;
475 	int desired;
476 
477 	/*
478 	 * Ensure we have at least one record for each bank and one
479 	 * record per CPU.
480 	 */
481 	desired = imax(mp_ncpus, mca_banks);
482 	mtx_lock_spin(&mca_lock);
483 	while (mca_freecount < desired) {
484 		mtx_unlock_spin(&mca_lock);
485 		rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
486 		mtx_lock_spin(&mca_lock);
487 		STAILQ_INSERT_TAIL(&mca_freelist, rec, link);
488 		mca_freecount++;
489 	}
490 	mtx_unlock_spin(&mca_lock);
491 }
492 
/*
 * Task handler for mca_refill_task: replenishes the free list from a
 * taskqueue thread.  Both task arguments are ignored.
 */
static void
mca_refill(void *context, int pending)
{

	mca_fill_freelist();
}
499 
500 static void
501 mca_record_entry(enum scan_mode mode, const struct mca_record *record)
502 {
503 	struct mca_internal *rec;
504 
505 	if (mode == POLLED) {
506 		rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
507 		mtx_lock_spin(&mca_lock);
508 	} else {
509 		mtx_lock_spin(&mca_lock);
510 		rec = STAILQ_FIRST(&mca_freelist);
511 		if (rec == NULL) {
512 			printf("MCA: Unable to allocate space for an event.\n");
513 			mca_log(record);
514 			mtx_unlock_spin(&mca_lock);
515 			return;
516 		}
517 		STAILQ_REMOVE_HEAD(&mca_freelist, link);
518 		mca_freecount--;
519 	}
520 
521 	rec->rec = *record;
522 	rec->logged = 0;
523 	STAILQ_INSERT_TAIL(&mca_records, rec, link);
524 	mca_count++;
525 	mtx_unlock_spin(&mca_lock);
526 	if (mode == CMCI && !cold)
527 		taskqueue_enqueue(mca_tq, &mca_refill_task);
528 }
529 
530 #ifdef DEV_APIC
531 /*
532  * Update the interrupt threshold for a CMCI.  The strategy is to use
533  * a low trigger that interrupts as soon as the first event occurs.
534  * However, if a steady stream of events arrive, the threshold is
535  * increased until the interrupts are throttled to once every
536  * cmc_throttle seconds or the periodic scan.  If a periodic scan
537  * finds that the threshold is too high, it is lowered.
538  */
539 static int
540 update_threshold(enum scan_mode mode, int valid, int last_intr, int count,
541     int cur_threshold, int max_threshold)
542 {
543 	u_int delta;
544 	int limit;
545 
546 	delta = (u_int)(time_uptime - last_intr);
547 	limit = cur_threshold;
548 
549 	/*
550 	 * If an interrupt was received less than cmc_throttle seconds
551 	 * since the previous interrupt and the count from the current
552 	 * event is greater than or equal to the current threshold,
553 	 * double the threshold up to the max.
554 	 */
555 	if (mode == CMCI && valid) {
556 		if (delta < cmc_throttle && count >= limit &&
557 		    limit < max_threshold) {
558 			limit = min(limit << 1, max_threshold);
559 		}
560 		return (limit);
561 	}
562 
563 	/*
564 	 * When the banks are polled, check to see if the threshold
565 	 * should be lowered.
566 	 */
567 	if (mode != POLLED)
568 		return (limit);
569 
570 	/* If a CMCI occured recently, do nothing for now. */
571 	if (delta < cmc_throttle)
572 		return (limit);
573 
574 	/*
575 	 * Compute a new limit based on the average rate of events per
576 	 * cmc_throttle seconds since the last interrupt.
577 	 */
578 	if (valid) {
579 		limit = count * cmc_throttle / delta;
580 		if (limit <= 0)
581 			limit = 1;
582 		else if (limit > max_threshold)
583 			limit = max_threshold;
584 	} else {
585 		limit = 1;
586 	}
587 	return (limit);
588 }
589 
590 static void
591 cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
592 {
593 	struct cmc_state *cc;
594 	uint64_t ctl;
595 	int cur_threshold, new_threshold;
596 	int count;
597 
598 	/* Fetch the current limit for this bank. */
599 	cc = &cmc_state[PCPU_GET(cpuid)][bank];
600 	ctl = rdmsr(MSR_MC_CTL2(bank));
601 	count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
602 	cur_threshold = ctl & MC_CTL2_THRESHOLD;
603 
604 	new_threshold = update_threshold(mode, valid, cc->last_intr, count,
605 	    cur_threshold, cc->max_threshold);
606 
607 	if (mode == CMCI && valid)
608 		cc->last_intr = time_uptime;
609 	if (new_threshold != cur_threshold) {
610 		ctl &= ~MC_CTL2_THRESHOLD;
611 		ctl |= new_threshold;
612 		wrmsr(MSR_MC_CTL2(bank), ctl);
613 	}
614 }
615 
616 static void
617 amd_thresholding_update(enum scan_mode mode, int bank, int valid)
618 {
619 	struct amd_et_state *cc;
620 	uint64_t misc;
621 	int new_threshold;
622 	int count;
623 
624 	KASSERT(bank == MC_AMDNB_BANK,
625 	    ("%s: unexpected bank %d", __func__, bank));
626 	cc = &amd_et_state[PCPU_GET(cpuid)];
627 	misc = rdmsr(MSR_MC_MISC(bank));
628 	count = (misc & MC_MISC_AMDNB_CNT_MASK) >> MC_MISC_AMDNB_CNT_SHIFT;
629 	count = count - (MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold);
630 
631 	new_threshold = update_threshold(mode, valid, cc->last_intr, count,
632 	    cc->cur_threshold, MC_MISC_AMDNB_CNT_MAX);
633 
634 	cc->cur_threshold = new_threshold;
635 	misc &= ~MC_MISC_AMDNB_CNT_MASK;
636 	misc |= (uint64_t)(MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold)
637 	    << MC_MISC_AMDNB_CNT_SHIFT;
638 	misc &= ~MC_MISC_AMDNB_OVERFLOW;
639 	wrmsr(MSR_MC_MISC(bank), misc);
640 	if (mode == CMCI && valid)
641 		cc->last_intr = time_uptime;
642 }
643 #endif
644 
645 /*
646  * This scans all the machine check banks of the current CPU to see if
647  * there are any machine checks.  Any non-recoverable errors are
648  * reported immediately via mca_log().  The current thread must be
649  * pinned when this is called.  The 'mode' parameter indicates if we
650  * are being called from the MC exception handler, the CMCI handler,
651  * or the periodic poller.  In the MC exception case this function
652  * returns true if the system is restartable.  Otherwise, it returns a
653  * count of the number of valid MC records found.
654  */
655 static int
656 mca_scan(enum scan_mode mode, int *recoverablep)
657 {
658 	struct mca_record rec;
659 	uint64_t mcg_cap, ucmask;
660 	int count, i, recoverable, valid;
661 
662 	count = 0;
663 	recoverable = 1;
664 	ucmask = MC_STATUS_UC | MC_STATUS_PCC;
665 
666 	/* When handling a MCE#, treat the OVER flag as non-restartable. */
667 	if (mode == MCE)
668 		ucmask |= MC_STATUS_OVER;
669 	mcg_cap = rdmsr(MSR_MCG_CAP);
670 	for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
671 #ifdef DEV_APIC
672 		/*
673 		 * For a CMCI, only check banks this CPU is
674 		 * responsible for.
675 		 */
676 		if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
677 			continue;
678 #endif
679 
680 		valid = mca_check_status(i, &rec);
681 		if (valid) {
682 			count++;
683 			if (rec.mr_status & ucmask) {
684 				recoverable = 0;
685 				mtx_lock_spin(&mca_lock);
686 				mca_log(&rec);
687 				mtx_unlock_spin(&mca_lock);
688 			}
689 			mca_record_entry(mode, &rec);
690 		}
691 
692 #ifdef DEV_APIC
693 		/*
694 		 * If this is a bank this CPU monitors via CMCI,
695 		 * update the threshold.
696 		 */
697 		if (PCPU_GET(cmci_mask) & 1 << i) {
698 			if (cmc_state != NULL)
699 				cmci_update(mode, i, valid, &rec);
700 			else
701 				amd_thresholding_update(mode, i, valid);
702 		}
703 #endif
704 	}
705 	if (mode == POLLED)
706 		mca_fill_freelist();
707 	if (recoverablep != NULL)
708 		*recoverablep = recoverable;
709 	return (count);
710 }
711 
712 /*
713  * Scan the machine check banks on all CPUs by binding to each CPU in
714  * turn.  If any of the CPUs contained new machine check records, log
715  * them to the console.
716  */
717 static void
718 mca_scan_cpus(void *context, int pending)
719 {
720 	struct mca_internal *mca;
721 	struct thread *td;
722 	int count, cpu;
723 
724 	mca_fill_freelist();
725 	td = curthread;
726 	count = 0;
727 	thread_lock(td);
728 	CPU_FOREACH(cpu) {
729 		sched_bind(td, cpu);
730 		thread_unlock(td);
731 		count += mca_scan(POLLED, NULL);
732 		thread_lock(td);
733 		sched_unbind(td);
734 	}
735 	thread_unlock(td);
736 	if (count != 0) {
737 		mtx_lock_spin(&mca_lock);
738 		STAILQ_FOREACH(mca, &mca_records, link) {
739 			if (!mca->logged) {
740 				mca->logged = 1;
741 				mca_log(&mca->rec);
742 			}
743 		}
744 		mtx_unlock_spin(&mca_lock);
745 	}
746 }
747 
/*
 * Callout handler for mca_timer: queues the scan task and re-arms the
 * callout so that it fires again after mca_ticks seconds (mca_ticks is
 * multiplied by hz to obtain ticks).  'arg' is unused.
 */
static void
mca_periodic_scan(void *arg)
{

	taskqueue_enqueue(mca_tq, &mca_scan_task);
	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
}
755 
756 static int
757 sysctl_mca_scan(SYSCTL_HANDLER_ARGS)
758 {
759 	int error, i;
760 
761 	i = 0;
762 	error = sysctl_handle_int(oidp, &i, 0, req);
763 	if (error)
764 		return (error);
765 	if (i)
766 		taskqueue_enqueue(mca_tq, &mca_scan_task);
767 	return (0);
768 }
769 
770 static void
771 mca_createtq(void *dummy)
772 {
773 	if (mca_banks <= 0)
774 		return;
775 
776 	mca_tq = taskqueue_create_fast("mca", M_WAITOK,
777 	    taskqueue_thread_enqueue, &mca_tq);
778 	taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq");
779 
780 	/* CMCIs during boot may have claimed items from the freelist. */
781 	mca_fill_freelist();
782 }
783 SYSINIT(mca_createtq, SI_SUB_CONFIGURE, SI_ORDER_ANY, mca_createtq, NULL);
784 
785 static void
786 mca_startup(void *dummy)
787 {
788 
789 	if (mca_banks <= 0)
790 		return;
791 
792 	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
793 }
794 #ifdef EARLY_AP_STARTUP
795 SYSINIT(mca_startup, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, mca_startup, NULL);
796 #else
797 SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
798 #endif
799 
800 #ifdef DEV_APIC
801 static void
802 cmci_setup(void)
803 {
804 	int i;
805 
806 	cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *), M_MCA,
807 	    M_WAITOK);
808 	for (i = 0; i <= mp_maxid; i++)
809 		cmc_state[i] = malloc(sizeof(struct cmc_state) * mca_banks,
810 		    M_MCA, M_WAITOK | M_ZERO);
811 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
812 	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
813 	    &cmc_throttle, 0, sysctl_positive_int, "I",
814 	    "Interval in seconds to throttle corrected MC interrupts");
815 }
816 
817 static void
818 amd_thresholding_setup(void)
819 {
820 
821 	amd_et_state = malloc((mp_maxid + 1) * sizeof(struct amd_et_state),
822 	    M_MCA, M_WAITOK | M_ZERO);
823 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
824 	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
825 	    &cmc_throttle, 0, sysctl_positive_int, "I",
826 	    "Interval in seconds to throttle corrected MC interrupts");
827 }
828 #endif
829 
/*
 * One-time global initialization, called from _mca_init() on CPU 0
 * during boot.  Records the bank count taken from 'mcg_cap', sets up
 * the lock, lists, tasks and callout, preallocates free records and
 * registers the dynamic hw.mca sysctls.
 */
static void
mca_setup(uint64_t mcg_cap)
{

	/*
	 * On AMD Family 10h processors, unless logging of level one TLB
	 * parity (L1TP) errors is disabled, enable the recommended workaround
	 * for Erratum 383.
	 */
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP)
		workaround_erratum383 = 1;

	mca_banks = mcg_cap & MCG_CAP_COUNT;
	/* The lock and both lists must exist before mca_fill_freelist(). */
	mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
	STAILQ_INIT(&mca_records);
	TASK_INIT(&mca_scan_task, 0, mca_scan_cpus, NULL);
	callout_init(&mca_timer, 1);
	STAILQ_INIT(&mca_freelist);
	TASK_INIT(&mca_refill_task, 0, mca_refill, NULL);
	mca_fill_freelist();
	SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "count", CTLFLAG_RD, (int *)(uintptr_t)&mca_count, 0,
	    "Record count");
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks,
	    0, sysctl_positive_int, "I",
	    "Periodic interval in seconds to scan for machine checks");
	SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "records", CTLFLAG_RD, sysctl_mca_records, "Machine check records");
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
	    sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
#ifdef DEV_APIC
	/* CMCI takes precedence; AMD thresholding is the fallback. */
	if (mcg_cap & MCG_CAP_CMCI_P)
		cmci_setup();
	else if (amd_thresholding_supported())
		amd_thresholding_setup();
#endif
}
870 
871 #ifdef DEV_APIC
/*
 * See if we should monitor CMCI for this bank.  If CMCI_EN is already
 * set in MC_CTL2, then another CPU is responsible for this bank, so
 * ignore it.  If CMCI_EN returns zero after being set, then this bank
 * does not support CMCI_EN.  If this CPU sets CMCI_EN, then it should
 * now monitor this bank.
 *
 * 'i' is the bank number.  On success the bank's bit is set in this
 * CPU's cmci_mask and its max_threshold is recorded.
 */
static void
cmci_monitor(int i)
{
	struct cmc_state *cc;
	uint64_t ctl;

	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));

	ctl = rdmsr(MSR_MC_CTL2(i));
	if (ctl & MC_CTL2_CMCI_EN)
		/* Already monitored by another CPU. */
		return;

	/* Set the threshold to one event for now. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= MC_CTL2_CMCI_EN | 1;
	wrmsr(MSR_MC_CTL2(i), ctl);
	/* Read back to find out whether the enable bit was accepted. */
	ctl = rdmsr(MSR_MC_CTL2(i));
	if (!(ctl & MC_CTL2_CMCI_EN))
		/* This bank does not support CMCI. */
		return;

	cc = &cmc_state[PCPU_GET(cpuid)][i];

	/*
	 * Determine maximum threshold: write 0x7fff into the field and
	 * keep whatever value the register reads back.
	 */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= 0x7fff;
	wrmsr(MSR_MC_CTL2(i), ctl);
	ctl = rdmsr(MSR_MC_CTL2(i));
	cc->max_threshold = ctl & MC_CTL2_THRESHOLD;

	/* Start off with a threshold of 1. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= 1;
	wrmsr(MSR_MC_CTL2(i), ctl);

	/* Mark this bank as monitored. */
	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
}
918 
919 /*
920  * For resume, reset the threshold for any banks we monitor back to
921  * one and throw away the timestamp of the last interrupt.
922  */
923 static void
924 cmci_resume(int i)
925 {
926 	struct cmc_state *cc;
927 	uint64_t ctl;
928 
929 	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
930 
931 	/* Ignore banks not monitored by this CPU. */
932 	if (!(PCPU_GET(cmci_mask) & 1 << i))
933 		return;
934 
935 	cc = &cmc_state[PCPU_GET(cpuid)][i];
936 	cc->last_intr = 0;
937 	ctl = rdmsr(MSR_MC_CTL2(i));
938 	ctl &= ~MC_CTL2_THRESHOLD;
939 	ctl |= MC_CTL2_CMCI_EN | 1;
940 	wrmsr(MSR_MC_CTL2(i), ctl);
941 }
942 
943 static void
944 amd_thresholding_start(struct amd_et_state *cc)
945 {
946 	uint64_t misc;
947 
948 	KASSERT(amd_elvt >= 0, ("ELVT offset is not set"));
949 	misc = rdmsr(MSR_MC_MISC(MC_AMDNB_BANK));
950 	misc &= ~MC_MISC_AMDNB_INT_MASK;
951 	misc |= MC_MISC_AMDNB_INT_LVT;
952 	misc &= ~MC_MISC_AMDNB_LVT_MASK;
953 	misc |= (uint64_t)amd_elvt << MC_MISC_AMDNB_LVT_SHIFT;
954 	misc &= ~MC_MISC_AMDNB_CNT_MASK;
955 	misc |= (uint64_t)(MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold)
956 	    << MC_MISC_AMDNB_CNT_SHIFT;
957 	misc &= ~MC_MISC_AMDNB_OVERFLOW;
958 	misc |= MC_MISC_AMDNB_CNTEN;
959 
960 	wrmsr(MSR_MC_MISC(MC_AMDNB_BANK), misc);
961 }
962 
963 static void
964 amd_thresholding_init(void)
965 {
966 	struct amd_et_state *cc;
967 	uint64_t misc;
968 
969 	/* The counter must be valid and present. */
970 	misc = rdmsr(MSR_MC_MISC(MC_AMDNB_BANK));
971 	if ((misc & (MC_MISC_AMDNB_VAL | MC_MISC_AMDNB_CNTP)) !=
972 	    (MC_MISC_AMDNB_VAL | MC_MISC_AMDNB_CNTP))
973 		return;
974 
975 	/* The register should not be locked. */
976 	if ((misc & MC_MISC_AMDNB_LOCK) != 0)
977 		return;
978 
979 	/*
980 	 * If counter is enabled then either the firmware or another CPU
981 	 * has already claimed it.
982 	 */
983 	if ((misc & MC_MISC_AMDNB_CNTEN) != 0)
984 		return;
985 
986 	/*
987 	 * Configure an Extended Interrupt LVT register for reporting
988 	 * counter overflows if that feature is supported and the first
989 	 * extended register is available.
990 	 */
991 	amd_elvt = lapic_enable_mca_elvt();
992 	if (amd_elvt < 0)
993 		return;
994 
995 	/* Re-use Intel CMC support infrastructure. */
996 	cc = &amd_et_state[PCPU_GET(cpuid)];
997 	cc->cur_threshold = 1;
998 	amd_thresholding_start(cc);
999 
1000 	/* Mark the NB bank as monitored. */
1001 	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << MC_AMDNB_BANK);
1002 }
1003 
1004 static void
1005 amd_thresholding_resume(void)
1006 {
1007 	struct amd_et_state *cc;
1008 
1009 	/* Nothing to do if this CPU doesn't monitor the NB bank. */
1010 	if ((PCPU_GET(cmci_mask) & 1 << MC_AMDNB_BANK) == 0)
1011 		return;
1012 
1013 	cc = &amd_et_state[PCPU_GET(cpuid)];
1014 	cc->last_intr = 0;
1015 	cc->cur_threshold = 1;
1016 	amd_thresholding_start(cc);
1017 }
1018 #endif
1019 
1020 /*
1021  * Initializes per-CPU machine check registers and enables corrected
1022  * machine check interrupts.
1023  */
1024 static void
1025 _mca_init(int boot)
1026 {
1027 	uint64_t mcg_cap;
1028 	uint64_t ctl, mask;
1029 	int i, skip;
1030 
1031 	/* MCE is required. */
1032 	if (!mca_enabled || !(cpu_feature & CPUID_MCE))
1033 		return;
1034 
1035 	if (cpu_feature & CPUID_MCA) {
1036 		if (boot)
1037 			PCPU_SET(cmci_mask, 0);
1038 
1039 		mcg_cap = rdmsr(MSR_MCG_CAP);
1040 		if (mcg_cap & MCG_CAP_CTL_P)
1041 			/* Enable MCA features. */
1042 			wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
1043 		if (PCPU_GET(cpuid) == 0 && boot)
1044 			mca_setup(mcg_cap);
1045 
1046 		/*
1047 		 * Disable logging of level one TLB parity (L1TP) errors by
1048 		 * the data cache as an alternative workaround for AMD Family
1049 		 * 10h Erratum 383.  Unlike the recommended workaround, there
1050 		 * is no performance penalty to this workaround.  However,
1051 		 * L1TP errors will go unreported.
1052 		 */
1053 		if (cpu_vendor_id == CPU_VENDOR_AMD &&
1054 		    CPUID_TO_FAMILY(cpu_id) == 0x10 && !amd10h_L1TP) {
1055 			mask = rdmsr(MSR_MC0_CTL_MASK);
1056 			if ((mask & (1UL << 5)) == 0)
1057 				wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5));
1058 		}
1059 		for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
1060 			/* By default enable logging of all errors. */
1061 			ctl = 0xffffffffffffffffUL;
1062 			skip = 0;
1063 
1064 			if (cpu_vendor_id == CPU_VENDOR_INTEL) {
1065 				/*
1066 				 * For P6 models before Nehalem MC0_CTL is
1067 				 * always enabled and reserved.
1068 				 */
1069 				if (i == 0 && CPUID_TO_FAMILY(cpu_id) == 0x6
1070 				    && CPUID_TO_MODEL(cpu_id) < 0x1a)
1071 					skip = 1;
1072 			} else if (cpu_vendor_id == CPU_VENDOR_AMD) {
1073 				/* BKDG for Family 10h: unset GartTblWkEn. */
1074 				if (i == 4 && CPUID_TO_FAMILY(cpu_id) >= 0xf)
1075 					ctl &= ~(1UL << 10);
1076 			}
1077 
1078 			if (!skip)
1079 				wrmsr(MSR_MC_CTL(i), ctl);
1080 
1081 #ifdef DEV_APIC
1082 			if (mcg_cap & MCG_CAP_CMCI_P) {
1083 				if (boot)
1084 					cmci_monitor(i);
1085 				else
1086 					cmci_resume(i);
1087 			}
1088 #endif
1089 
1090 			/* Clear all errors. */
1091 			wrmsr(MSR_MC_STATUS(i), 0);
1092 		}
1093 
1094 #ifdef DEV_APIC
1095 		/*
1096 		 * AMD Processors from families 10h - 16h provide support
1097 		 * for Machine Check Error Thresholding.
1098 		 * The processors support counters of MC errors and they
1099 		 * can be configured to generate an interrupt when a counter
1100 		 * overflows.
1101 		 * The counters are all associated with Bank 4 and each
1102 		 * of them covers a group of errors reported via that bank.
1103 		 * At the moment only the DRAM Error Threshold Group is
1104 		 * supported.
1105 		 */
1106 		if (amd_thresholding_supported() &&
1107 		    (mcg_cap & MCG_CAP_COUNT) >= 4) {
1108 			if (boot)
1109 				amd_thresholding_init();
1110 			else
1111 				amd_thresholding_resume();
1112 		} else if (PCPU_GET(cmci_mask) != 0 && boot) {
1113 			lapic_enable_cmc();
1114 		}
1115 #endif
1116 	}
1117 
1118 	load_cr4(rcr4() | CR4_MCE);
1119 }
1120 
/*
 * Must be executed on each CPU during boot.  Runs the machine check
 * initialization in its boot flavour (_mca_init(1)).
 */
void
mca_init(void)
{

	_mca_init(1);
}
1128 
/*
 * Must be executed on each CPU during resume.  Runs the machine check
 * initialization in its resume flavour (_mca_init(0)).
 */
void
mca_resume(void)
{

	_mca_init(0);
}
1136 
/*
 * The machine check registers for the BSP cannot be initialized until
 * the local APIC is initialized.  This happens at SI_SUB_CPU,
 * SI_ORDER_SECOND.
 *
 * This SYSINIT hook is registered at SI_SUB_CPU, SI_ORDER_ANY and simply
 * calls mca_init().
 */
static void
mca_init_bsp(void *arg __unused)
{

	mca_init();
}
SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL);
1149 
1150 /* Called when a machine check exception fires. */
1151 void
1152 mca_intr(void)
1153 {
1154 	uint64_t mcg_status;
1155 	int recoverable, count;
1156 
1157 	if (!(cpu_feature & CPUID_MCA)) {
1158 		/*
1159 		 * Just print the values of the old Pentium registers
1160 		 * and panic.
1161 		 */
1162 		printf("MC Type: 0x%jx  Address: 0x%jx\n",
1163 		    (uintmax_t)rdmsr(MSR_P5_MC_TYPE),
1164 		    (uintmax_t)rdmsr(MSR_P5_MC_ADDR));
1165 		panic("Machine check");
1166 	}
1167 
1168 	/* Scan the banks and check for any non-recoverable errors. */
1169 	count = mca_scan(MCE, &recoverable);
1170 	mcg_status = rdmsr(MSR_MCG_STATUS);
1171 	if (!(mcg_status & MCG_STATUS_RIPV))
1172 		recoverable = 0;
1173 
1174 	if (!recoverable) {
1175 		/*
1176 		 * Only panic if the error was detected local to this CPU.
1177 		 * Some errors will assert a machine check on all CPUs, but
1178 		 * only certain CPUs will find a valid bank to log.
1179 		 */
1180 		while (count == 0)
1181 			cpu_spinwait();
1182 
1183 		panic("Unrecoverable machine check exception");
1184 	}
1185 
1186 	/* Clear MCIP. */
1187 	wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
1188 }
1189 
1190 #ifdef DEV_APIC
1191 /* Called for a CMCI (correctable machine check interrupt). */
1192 void
1193 cmc_intr(void)
1194 {
1195 	struct mca_internal *mca;
1196 	int count;
1197 
1198 	/*
1199 	 * Serialize MCA bank scanning to prevent collisions from
1200 	 * sibling threads.
1201 	 */
1202 	count = mca_scan(CMCI, NULL);
1203 
1204 	/* If we found anything, log them to the console. */
1205 	if (count != 0) {
1206 		mtx_lock_spin(&mca_lock);
1207 		STAILQ_FOREACH(mca, &mca_records, link) {
1208 			if (!mca->logged) {
1209 				mca->logged = 1;
1210 				mca_log(&mca->rec);
1211 			}
1212 		}
1213 		mtx_unlock_spin(&mca_lock);
1214 	}
1215 }
1216 #endif
1217