xref: /linux/drivers/edac/mce_amd.c (revision f86fd32d)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/module.h>
3 #include <linux/slab.h>
4 
5 #include <asm/cpu.h>
6 
7 #include "mce_amd.h"
8 
9 static struct amd_decoder_ops fam_ops;
10 
11 static u8 xec_mask	 = 0xf;
12 
13 static bool report_gart_errors;
14 static void (*decode_dram_ecc)(int node_id, struct mce *m);
15 
16 void amd_report_gart_errors(bool v)
17 {
18 	report_gart_errors = v;
19 }
20 EXPORT_SYMBOL_GPL(amd_report_gart_errors);
21 
22 void amd_register_ecc_decoder(void (*f)(int, struct mce *))
23 {
24 	decode_dram_ecc = f;
25 }
26 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
27 
28 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
29 {
30 	if (decode_dram_ecc) {
31 		WARN_ON(decode_dram_ecc != f);
32 
33 		decode_dram_ecc = NULL;
34 	}
35 }
36 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
37 
38 /*
39  * string representation for the different MCA reported error types, see F3x48
40  * or MSR0000_0411.
41  */
42 
/* transaction type, indexed by the encoded field value */
static const char * const tt_msgs[] = {
	[0] = "INSN",
	[1] = "DATA",
	[2] = "GEN",
	[3] = "RESV",
};

/* cache level, indexed by the encoded field value */
static const char * const ll_msgs[] = {
	[0] = "RESV",
	[1] = "L1",
	[2] = "L2",
	[3] = "L3/GEN",
};

/* memory transaction type, indexed by the encoded field value */
static const char * const rrrr_msgs[] = {
	[0] = "GEN",
	[1] = "RD",
	[2] = "WR",
	[3] = "DRD",
	[4] = "DWR",
	[5] = "IRD",
	[6] = "PRF",
	[7] = "EV",
	[8] = "SNP",
};
53 
/* participating processor, indexed by the encoded field value (exported) */
const char * const pp_msgs[] = {
	[0] = "SRC",
	[1] = "RES",
	[2] = "OBS",
	[3] = "GEN",
};
EXPORT_SYMBOL_GPL(pp_msgs);
57 
/* request timeout, indexed by the encoded field value */
static const char * const to_msgs[] = {
	[0] = "no timeout",
	[1] = "timed out",
};

/* memory or i/o, indexed by the encoded field value */
static const char * const ii_msgs[] = {
	[0] = "MEM",
	[1] = "RESV",
	[2] = "IO",
	[3] = "GEN",
};

/* internal error type, indexed by the encoded field value */
static const char * const uu_msgs[] = {
	[0] = "RESV",
	[1] = "RESV",
	[2] = "HWA",
	[3] = "RESV",
};
66 
/*
 * F15h MC1 descriptions. The array is packed: f15h_mc1_mce() maps the
 * extended error code (xec) noted next to each entry onto the array index.
 */
static const char * const f15h_mc1_mce_desc[] = {
	/* xec = 0x00 */ "UC during a demand linefill from L2",
	/* xec = 0x01 */ "Parity error during data load from IC",
	/* xec = 0x02 */ "Parity error for IC valid bit",
	/* xec = 0x03 */ "Main tag parity error",
	/* xec = 0x04 */ "Parity error in prediction queue",
	/* xec = 0x05 */ "PFB data/address parity error",
	/* xec = 0x06 */ "Parity error in the branch status reg",
	/* xec = 0x07 */ "PFB promotion address error",
	/* xec = 0x08 */ "Tag error during probe/victimization",
	/* xec = 0x09 */ "Parity error for IC probe tag valid bit",
	/* xec = 0x0a */ "PFB non-cacheable bit parity error",
	/* xec = 0x0d */ "PFB valid bit parity error",
	/* xec = 0x10 */ "Microcode Patch Buffer",
	/* xec = 0x11 */ "uop queue",
	/* xec = 0x12 */ "insn buffer",
	/* xec = 0x13 */ "predecode buffer",
	/* xec = 0x14 */ "fetch address FIFO",
	/* xec = 0x15 */ "dispatch uop queue"
};
87 
/*
 * F15h MC2 descriptions. The array is packed: f15h_mc2_mce() maps the
 * extended error code (xec) noted next to each entry onto the array index.
 */
static const char * const f15h_mc2_mce_desc[] = {
	/* xec = 0x04 */ "Fill ECC error on data fills",
	/* xec = 0x05 */ "Fill parity error on insn fills",
	/* xec = 0x06 */ "Prefetcher request FIFO parity error",
	/* xec = 0x07 */ "PRQ address parity error",
	/* xec = 0x08 */ "PRQ data parity error",
	/* xec = 0x09 */ "WCC Tag ECC error",
	/* xec = 0x0a */ "WCC Data ECC error",
	/* xec = 0x0b */ "WCB Data parity error",
	/* xec = 0x0c */ "VB Data ECC or parity error",
	/* xec = 0x10 */ "L2 Tag ECC error",
	/* xec = 0x11 */ "Hard L2 Tag ECC error",
	/* xec = 0x12 */ "Multiple hits on L2 tag",
	/* xec = 0x13 */ "XAB parity error",
	/* xec = 0x14 */ "PRB address parity error"
};
104 
/*
 * MC4 (northbridge) descriptions. Entries for xec 0x00-0x0e are indexed
 * directly; decode_mc4_mce() subtracts 13 from xec 0x1c-0x1f to reach the
 * trailing L3/probe-filter entries.
 */
static const char * const mc4_mce_desc[] = {
	/* xec = 0x00 */ "DRAM ECC error detected on the NB",
	/* xec = 0x01 */ "CRC error detected on HT link",
	/* xec = 0x02 */ "Link-defined sync error packets detected on HT link",
	/* xec = 0x03 */ "HT Master abort",
	/* xec = 0x04 */ "HT Target abort",
	/* xec = 0x05 */ "Invalid GART PTE entry during GART table walk",
	/* xec = 0x06 */ "Unsupported atomic RMW received from an IO link",
	/* xec = 0x07 */ "Watchdog timeout due to lack of progress",
	/* xec = 0x08 */ "DRAM ECC error detected on the NB",
	/* xec = 0x09 */ "SVM DMA Exclusion Vector error",
	/* xec = 0x0a */ "HT data error detected on link",
	/* xec = 0x0b */ "Protocol error (link, L3, probe filter)",
	/* xec = 0x0c */ "NB internal arrays parity error",
	/* xec = 0x0d */ "DRAM addr/ctl signals parity error",
	/* xec = 0x0e */ "IO link transmission error",
	/* xec = 0x1c */ "L3 data cache ECC error",
	/* xec = 0x1d */ "L3 cache tag error",
	/* xec = 0x1e */ "L3 LRU parity bits error",
	/* xec = 0x1f */ "ECC Error in the Probe Filter directory"
};
126 
/* MC5 descriptions, indexed directly by extended error code. */
static const char * const mc5_mce_desc[] = {
	[0x0] = "CPU Watchdog timer expire",
	[0x1] = "Wakeup array dest tag",
	[0x2] = "AG payload array",
	[0x3] = "EX payload array",
	[0x4] = "IDRF array",
	[0x5] = "Retire dispatch queue",
	[0x6] = "Mapper checkpoint array",
	[0x7] = "Physical register file EX0 port",
	[0x8] = "Physical register file EX1 port",
	[0x9] = "Physical register file AG0 port",
	[0xa] = "Physical register file AG1 port",
	[0xb] = "Flag register file",
	[0xc] = "DE error occurred",
	[0xd] = "Retire status queue"
};
143 
/* MC6 descriptions, indexed directly by extended error code. */
static const char * const mc6_mce_desc[] = {
	[0x0] = "Hardware Assertion",
	[0x1] = "Free List",
	[0x2] = "Physical Register File",
	[0x3] = "Retire Queue",
	[0x4] = "Scheduler table",
	[0x5] = "Status Register File",
};
152 
153 /* Scalable MCA error strings */
/* SMCA load-store unit descriptions, indexed by extended error code. */
static const char * const smca_ls_mce_desc[] = {
	[0x00] = "Load queue parity error",
	[0x01] = "Store queue parity error",
	[0x02] = "Miss address buffer payload parity error",
	[0x03] = "Level 1 TLB parity error",
	[0x04] = "DC Tag error type 5",
	[0x05] = "DC Tag error type 6",
	[0x06] = "DC Tag error type 1",
	[0x07] = "Internal error type 1",
	[0x08] = "Internal error type 2",
	[0x09] = "System Read Data Error Thread 0",
	[0x0a] = "System Read Data Error Thread 1",
	[0x0b] = "DC Tag error type 2",
	[0x0c] = "DC Data error type 1 and poison consumption",
	[0x0d] = "DC Data error type 2",
	[0x0e] = "DC Data error type 3",
	[0x0f] = "DC Tag error type 4",
	[0x10] = "Level 2 TLB parity error",
	[0x11] = "PDC parity error",
	[0x12] = "DC Tag error type 3",
	[0x13] = "DC Tag error type 5",
	[0x14] = "L2 Fill Data error",
};
177 
/* SMCA load-store unit (v2) descriptions, indexed by extended error code. */
static const char * const smca_ls2_mce_desc[] = {
	[0x00] = "An ECC error was detected on a data cache read by a probe or victimization",
	[0x01] = "An ECC error or L2 poison was detected on a data cache read by a load",
	[0x02] = "An ECC error was detected on a data cache read-modify-write by a store",
	[0x03] = "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization",
	[0x04] = "An ECC error or poison bit mismatch was detected on a tag read by a load",
	[0x05] = "An ECC error or poison bit mismatch was detected on a tag read by a store",
	[0x06] = "An ECC error was detected on an EMEM read by a load",
	[0x07] = "An ECC error was detected on an EMEM read-modify-write by a store",
	[0x08] = "A parity error was detected in an L1 TLB entry by any access",
	[0x09] = "A parity error was detected in an L2 TLB entry by any access",
	[0x0a] = "A parity error was detected in a PWC entry by any access",
	[0x0b] = "A parity error was detected in an STQ entry by any access",
	[0x0c] = "A parity error was detected in an LDQ entry by any access",
	[0x0d] = "A parity error was detected in a MAB entry by any access",
	[0x0e] = "A parity error was detected in an SCB entry state field by any access",
	[0x0f] = "A parity error was detected in an SCB entry address field by any access",
	[0x10] = "A parity error was detected in an SCB entry data field by any access",
	[0x11] = "A parity error was detected in a WCB entry by any access",
	[0x12] = "A poisoned line was detected in an SCB entry by any access",
	[0x13] = "A SystemReadDataError error was reported on read data returned from L2 for a load",
	[0x14] = "A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
	[0x15] = "A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
	[0x16] = "A hardware assertion error was reported",
	[0x17] = "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
};
204 
/* SMCA instruction fetch unit descriptions, indexed by extended error code. */
static const char * const smca_if_mce_desc[] = {
	[0x0] = "Op Cache Microtag Probe Port Parity Error",
	[0x1] = "IC Microtag or Full Tag Multi-hit Error",
	[0x2] = "IC Full Tag Parity Error",
	[0x3] = "IC Data Array Parity Error",
	[0x4] = "Decoupling Queue PhysAddr Parity Error",
	[0x5] = "L0 ITLB Parity Error",
	[0x6] = "L1 ITLB Parity Error",
	[0x7] = "L2 ITLB Parity Error",
	[0x8] = "BPQ Thread 0 Snoop Parity Error",
	[0x9] = "BPQ Thread 1 Snoop Parity Error",
	[0xa] = "L1 BTB Multi-Match Error",
	[0xb] = "L2 BTB Multi-Match Error",
	[0xc] = "L2 Cache Response Poison Error",
	[0xd] = "System Read Data Error",
};
221 
/* SMCA L2 cache descriptions, indexed by extended error code. */
static const char * const smca_l2_mce_desc[] = {
	[0x0] = "L2M Tag Multiple-Way-Hit error",
	[0x1] = "L2M Tag or State Array ECC Error",
	[0x2] = "L2M Data Array ECC Error",
	[0x3] = "Hardware Assert Error",
};
228 
/* SMCA decode unit descriptions, indexed by extended error code. */
static const char * const smca_de_mce_desc[] = {
	[0x0] = "Micro-op cache tag parity error",
	[0x1] = "Micro-op cache data parity error",
	[0x2] = "Instruction buffer parity error",
	[0x3] = "Micro-op queue parity error",
	[0x4] = "Instruction dispatch queue parity error",
	[0x5] = "Fetch address FIFO parity error",
	[0x6] = "Patch RAM data parity error",
	[0x7] = "Patch RAM sequencer parity error",
	[0x8] = "Micro-op buffer parity error"
};
240 
/* SMCA execution unit descriptions, indexed by extended error code. */
static const char * const smca_ex_mce_desc[] = {
	[0x0] = "Watchdog Timeout error",
	[0x1] = "Physical register file parity error",
	[0x2] = "Flag register file parity error",
	[0x3] = "Immediate displacement register file parity error",
	[0x4] = "Address generator payload parity error",
	[0x5] = "EX payload parity error",
	[0x6] = "Checkpoint queue parity error",
	[0x7] = "Retire dispatch queue parity error",
	[0x8] = "Retire status queue parity error",
	[0x9] = "Scheduling queue parity error",
	[0xa] = "Branch buffer queue parity error",
	[0xb] = "Hardware Assertion error",
};
255 
/* SMCA floating point unit descriptions, indexed by extended error code. */
static const char * const smca_fp_mce_desc[] = {
	[0x0] = "Physical register file (PRF) parity error",
	[0x1] = "Freelist (FL) parity error",
	[0x2] = "Schedule queue parity error",
	[0x3] = "NSQ parity error",
	[0x4] = "Retire queue (RQ) parity error",
	[0x5] = "Status register file (SRF) parity error",
	[0x6] = "Hardware assertion",
};
265 
/* SMCA L3 cache descriptions, indexed by extended error code. */
static const char * const smca_l3_mce_desc[] = {
	[0x0] = "Shadow Tag Macro ECC Error",
	[0x1] = "Shadow Tag Macro Multi-way-hit Error",
	[0x2] = "L3M Tag ECC Error",
	[0x3] = "L3M Tag Multi-way-hit Error",
	[0x4] = "L3M Data ECC Error",
	[0x5] = "SDP Parity Error or SystemReadDataError from XI",
	[0x6] = "L3 Victim Queue Parity Error",
	[0x7] = "L3 Hardware Assertion",
};
276 
/* SMCA coherent slave descriptions, indexed by extended error code. */
static const char * const smca_cs_mce_desc[] = {
	[0x0] = "Illegal Request",
	[0x1] = "Address Violation",
	[0x2] = "Security Violation",
	[0x3] = "Illegal Response",
	[0x4] = "Unexpected Response",
	[0x5] = "Request or Probe Parity Error",
	[0x6] = "Read Response Parity Error",
	[0x7] = "Atomic Request Parity Error",
	[0x8] = "Probe Filter ECC Error",
};
288 
/* SMCA coherent slave (v2) descriptions, indexed by extended error code. */
static const char * const smca_cs2_mce_desc[] = {
	[0x0] = "Illegal Request",
	[0x1] = "Address Violation",
	[0x2] = "Security Violation",
	[0x3] = "Illegal Response",
	[0x4] = "Unexpected Response",
	[0x5] = "Request or Probe Parity Error",
	[0x6] = "Read Response Parity Error",
	[0x7] = "Atomic Request Parity Error",
	[0x8] = "SDP read response had no match in the CS queue",
	[0x9] = "Probe Filter Protocol Error",
	[0xa] = "Probe Filter ECC Error",
	[0xb] = "SDP read response had an unexpected RETRY error",
	[0xc] = "Counter overflow error",
	[0xd] = "Counter underflow error",
};
305 
/* SMCA PIE descriptions, indexed by extended error code. */
static const char * const smca_pie_mce_desc[] = {
	[0x0] = "Hardware Assert",
	[0x1] = "Register security violation",
	[0x2] = "Link Error",
	[0x3] = "Poison data consumption",
	[0x4] = "A deferred error was detected in the DF"
};
313 
/*
 * SMCA UMC descriptions, indexed by extended error code. decode_smca_error()
 * additionally hands xec 0 to the registered DRAM ECC decoder.
 */
static const char * const smca_umc_mce_desc[] = {
	[0x0] = "DRAM ECC error",
	[0x1] = "Data poison error",
	[0x2] = "SDP parity error",
	[0x3] = "Advanced peripheral bus error",
	[0x4] = "Address/Command parity error",
	[0x5] = "Write data CRC error",
	[0x6] = "DCQ SRAM ECC error",
	[0x7] = "AES SRAM ECC error",
};
324 
/* SMCA parameter block descriptions, indexed by extended error code. */
static const char * const smca_pb_mce_desc[] = {
	[0x0] = "An ECC error in the Parameter Block RAM array",
};
328 
/* SMCA PSP descriptions, indexed by extended error code. */
static const char * const smca_psp_mce_desc[] = {
	[0x0] = "An ECC or parity error in a PSP RAM instance",
};
332 
/* SMCA PSP (v2) descriptions, indexed by extended error code. */
static const char * const smca_psp2_mce_desc[] = {
	[0x00] = "High SRAM ECC or parity error",
	[0x01] = "Low SRAM ECC or parity error",
	[0x02] = "Instruction Cache Bank 0 ECC or parity error",
	[0x03] = "Instruction Cache Bank 1 ECC or parity error",
	[0x04] = "Instruction Tag Ram 0 parity error",
	[0x05] = "Instruction Tag Ram 1 parity error",
	[0x06] = "Data Cache Bank 0 ECC or parity error",
	[0x07] = "Data Cache Bank 1 ECC or parity error",
	[0x08] = "Data Cache Bank 2 ECC or parity error",
	[0x09] = "Data Cache Bank 3 ECC or parity error",
	[0x0a] = "Data Tag Bank 0 parity error",
	[0x0b] = "Data Tag Bank 1 parity error",
	[0x0c] = "Data Tag Bank 2 parity error",
	[0x0d] = "Data Tag Bank 3 parity error",
	[0x0e] = "Dirty Data Ram parity error",
	[0x0f] = "TLB Bank 0 parity error",
	[0x10] = "TLB Bank 1 parity error",
	[0x11] = "System Hub Read Buffer ECC or parity error",
};
353 
/* SMCA SMU descriptions, indexed by extended error code. */
static const char * const smca_smu_mce_desc[] = {
	[0x0] = "An ECC or parity error in an SMU RAM instance",
};
357 
/* SMCA SMU (v2) descriptions, indexed by extended error code. */
static const char * const smca_smu2_mce_desc[] = {
	[0x0] = "High SRAM ECC or parity error",
	[0x1] = "Low SRAM ECC or parity error",
	[0x2] = "Data Cache Bank A ECC or parity error",
	[0x3] = "Data Cache Bank B ECC or parity error",
	[0x4] = "Data Tag Cache Bank A ECC or parity error",
	[0x5] = "Data Tag Cache Bank B ECC or parity error",
	[0x6] = "Instruction Cache Bank A ECC or parity error",
	[0x7] = "Instruction Cache Bank B ECC or parity error",
	[0x8] = "Instruction Tag Cache Bank A ECC or parity error",
	[0x9] = "Instruction Tag Cache Bank B ECC or parity error",
	[0xa] = "System Hub Read Buffer ECC or parity error",
};
371 
/* SMCA MP5 descriptions, indexed by extended error code. */
static const char * const smca_mp5_mce_desc[] = {
	[0x0] = "High SRAM ECC or parity error",
	[0x1] = "Low SRAM ECC or parity error",
	[0x2] = "Data Cache Bank A ECC or parity error",
	[0x3] = "Data Cache Bank B ECC or parity error",
	[0x4] = "Data Tag Cache Bank A ECC or parity error",
	[0x5] = "Data Tag Cache Bank B ECC or parity error",
	[0x6] = "Instruction Cache Bank A ECC or parity error",
	[0x7] = "Instruction Cache Bank B ECC or parity error",
	[0x8] = "Instruction Tag Cache Bank A ECC or parity error",
	[0x9] = "Instruction Tag Cache Bank B ECC or parity error",
};
384 
/* SMCA NBIO descriptions, indexed by extended error code. */
static const char * const smca_nbio_mce_desc[] = {
	[0x0] = "ECC or Parity error",
	[0x1] = "PCIE error",
	[0x2] = "SDP ErrEvent error",
	[0x3] = "SDP Egress Poison Error",
	[0x4] = "IOHC Internal Poison Error",
};
392 
/* SMCA PCIe descriptions, indexed by extended error code. */
static const char * const smca_pcie_mce_desc[] = {
	[0x0] = "CCIX PER Message logging",
	[0x1] = "CCIX Read Response with Status: Non-Data Error",
	[0x2] = "CCIX Write Response with Status: Non-Data Error",
	[0x3] = "CCIX Read Response with Status: Data Error",
	[0x4] = "CCIX Non-okay write response with data error",
};
400 
/* One description table: an array of strings plus its element count. */
struct smca_mce_desc {
	/* Description strings, looked up by extended error code. */
	const char * const *descs;
	/* Number of entries in @descs. */
	unsigned int num_descs;
};
405 
406 static struct smca_mce_desc smca_mce_descs[] = {
407 	[SMCA_LS]	= { smca_ls_mce_desc,	ARRAY_SIZE(smca_ls_mce_desc)	},
408 	[SMCA_LS_V2]	= { smca_ls2_mce_desc,	ARRAY_SIZE(smca_ls2_mce_desc)	},
409 	[SMCA_IF]	= { smca_if_mce_desc,	ARRAY_SIZE(smca_if_mce_desc)	},
410 	[SMCA_L2_CACHE]	= { smca_l2_mce_desc,	ARRAY_SIZE(smca_l2_mce_desc)	},
411 	[SMCA_DE]	= { smca_de_mce_desc,	ARRAY_SIZE(smca_de_mce_desc)	},
412 	[SMCA_EX]	= { smca_ex_mce_desc,	ARRAY_SIZE(smca_ex_mce_desc)	},
413 	[SMCA_FP]	= { smca_fp_mce_desc,	ARRAY_SIZE(smca_fp_mce_desc)	},
414 	[SMCA_L3_CACHE]	= { smca_l3_mce_desc,	ARRAY_SIZE(smca_l3_mce_desc)	},
415 	[SMCA_CS]	= { smca_cs_mce_desc,	ARRAY_SIZE(smca_cs_mce_desc)	},
416 	[SMCA_CS_V2]	= { smca_cs2_mce_desc,	ARRAY_SIZE(smca_cs2_mce_desc)	},
417 	[SMCA_PIE]	= { smca_pie_mce_desc,	ARRAY_SIZE(smca_pie_mce_desc)	},
418 	[SMCA_UMC]	= { smca_umc_mce_desc,	ARRAY_SIZE(smca_umc_mce_desc)	},
419 	[SMCA_PB]	= { smca_pb_mce_desc,	ARRAY_SIZE(smca_pb_mce_desc)	},
420 	[SMCA_PSP]	= { smca_psp_mce_desc,	ARRAY_SIZE(smca_psp_mce_desc)	},
421 	[SMCA_PSP_V2]	= { smca_psp2_mce_desc,	ARRAY_SIZE(smca_psp2_mce_desc)	},
422 	[SMCA_SMU]	= { smca_smu_mce_desc,	ARRAY_SIZE(smca_smu_mce_desc)	},
423 	[SMCA_SMU_V2]	= { smca_smu2_mce_desc,	ARRAY_SIZE(smca_smu2_mce_desc)	},
424 	[SMCA_MP5]	= { smca_mp5_mce_desc,	ARRAY_SIZE(smca_mp5_mce_desc)	},
425 	[SMCA_NBIO]	= { smca_nbio_mce_desc,	ARRAY_SIZE(smca_nbio_mce_desc)	},
426 	[SMCA_PCIE]	= { smca_pcie_mce_desc,	ARRAY_SIZE(smca_pcie_mce_desc)	},
427 };
428 
429 static bool f12h_mc0_mce(u16 ec, u8 xec)
430 {
431 	bool ret = false;
432 
433 	if (MEM_ERROR(ec)) {
434 		u8 ll = LL(ec);
435 		ret = true;
436 
437 		if (ll == LL_L2)
438 			pr_cont("during L1 linefill from L2.\n");
439 		else if (ll == LL_L1)
440 			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
441 		else
442 			ret = false;
443 	}
444 	return ret;
445 }
446 
447 static bool f10h_mc0_mce(u16 ec, u8 xec)
448 {
449 	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
450 		pr_cont("during data scrub.\n");
451 		return true;
452 	}
453 	return f12h_mc0_mce(ec, xec);
454 }
455 
456 static bool k8_mc0_mce(u16 ec, u8 xec)
457 {
458 	if (BUS_ERROR(ec)) {
459 		pr_cont("during system linefill.\n");
460 		return true;
461 	}
462 
463 	return f10h_mc0_mce(ec, xec);
464 }
465 
466 static bool cat_mc0_mce(u16 ec, u8 xec)
467 {
468 	u8 r4	 = R4(ec);
469 	bool ret = true;
470 
471 	if (MEM_ERROR(ec)) {
472 
473 		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
474 			return false;
475 
476 		switch (r4) {
477 		case R4_DRD:
478 		case R4_DWR:
479 			pr_cont("Data/Tag parity error due to %s.\n",
480 				(r4 == R4_DRD ? "load/hw prf" : "store"));
481 			break;
482 		case R4_EVICT:
483 			pr_cont("Copyback parity error on a tag miss.\n");
484 			break;
485 		case R4_SNOOP:
486 			pr_cont("Tag parity error during snoop.\n");
487 			break;
488 		default:
489 			ret = false;
490 		}
491 	} else if (BUS_ERROR(ec)) {
492 
493 		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
494 			return false;
495 
496 		pr_cont("System read data error on a ");
497 
498 		switch (r4) {
499 		case R4_RD:
500 			pr_cont("TLB reload.\n");
501 			break;
502 		case R4_DWR:
503 			pr_cont("store.\n");
504 			break;
505 		case R4_DRD:
506 			pr_cont("load.\n");
507 			break;
508 		default:
509 			ret = false;
510 		}
511 	} else {
512 		ret = false;
513 	}
514 
515 	return ret;
516 }
517 
518 static bool f15h_mc0_mce(u16 ec, u8 xec)
519 {
520 	bool ret = true;
521 
522 	if (MEM_ERROR(ec)) {
523 
524 		switch (xec) {
525 		case 0x0:
526 			pr_cont("Data Array access error.\n");
527 			break;
528 
529 		case 0x1:
530 			pr_cont("UC error during a linefill from L2/NB.\n");
531 			break;
532 
533 		case 0x2:
534 		case 0x11:
535 			pr_cont("STQ access error.\n");
536 			break;
537 
538 		case 0x3:
539 			pr_cont("SCB access error.\n");
540 			break;
541 
542 		case 0x10:
543 			pr_cont("Tag error.\n");
544 			break;
545 
546 		case 0x12:
547 			pr_cont("LDQ access error.\n");
548 			break;
549 
550 		default:
551 			ret = false;
552 		}
553 	} else if (BUS_ERROR(ec)) {
554 
555 		if (!xec)
556 			pr_cont("System Read Data Error.\n");
557 		else
558 			pr_cont(" Internal error condition type %d.\n", xec);
559 	} else if (INT_ERROR(ec)) {
560 		if (xec <= 0x1f)
561 			pr_cont("Hardware Assert.\n");
562 		else
563 			ret = false;
564 
565 	} else
566 		ret = false;
567 
568 	return ret;
569 }
570 
571 static void decode_mc0_mce(struct mce *m)
572 {
573 	u16 ec = EC(m->status);
574 	u8 xec = XEC(m->status, xec_mask);
575 
576 	pr_emerg(HW_ERR "MC0 Error: ");
577 
578 	/* TLB error signatures are the same across families */
579 	if (TLB_ERROR(ec)) {
580 		if (TT(ec) == TT_DATA) {
581 			pr_cont("%s TLB %s.\n", LL_MSG(ec),
582 				((xec == 2) ? "locked miss"
583 					    : (xec ? "multimatch" : "parity")));
584 			return;
585 		}
586 	} else if (fam_ops.mc0_mce(ec, xec))
587 		;
588 	else
589 		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
590 }
591 
592 static bool k8_mc1_mce(u16 ec, u8 xec)
593 {
594 	u8 ll	 = LL(ec);
595 	bool ret = true;
596 
597 	if (!MEM_ERROR(ec))
598 		return false;
599 
600 	if (ll == 0x2)
601 		pr_cont("during a linefill from L2.\n");
602 	else if (ll == 0x1) {
603 		switch (R4(ec)) {
604 		case R4_IRD:
605 			pr_cont("Parity error during data load.\n");
606 			break;
607 
608 		case R4_EVICT:
609 			pr_cont("Copyback Parity/Victim error.\n");
610 			break;
611 
612 		case R4_SNOOP:
613 			pr_cont("Tag Snoop error.\n");
614 			break;
615 
616 		default:
617 			ret = false;
618 			break;
619 		}
620 	} else
621 		ret = false;
622 
623 	return ret;
624 }
625 
626 static bool cat_mc1_mce(u16 ec, u8 xec)
627 {
628 	u8 r4    = R4(ec);
629 	bool ret = true;
630 
631 	if (!MEM_ERROR(ec))
632 		return false;
633 
634 	if (TT(ec) != TT_INSTR)
635 		return false;
636 
637 	if (r4 == R4_IRD)
638 		pr_cont("Data/tag array parity error for a tag hit.\n");
639 	else if (r4 == R4_SNOOP)
640 		pr_cont("Tag error during snoop/victimization.\n");
641 	else if (xec == 0x0)
642 		pr_cont("Tag parity error from victim castout.\n");
643 	else if (xec == 0x2)
644 		pr_cont("Microcode patch RAM parity error.\n");
645 	else
646 		ret = false;
647 
648 	return ret;
649 }
650 
651 static bool f15h_mc1_mce(u16 ec, u8 xec)
652 {
653 	bool ret = true;
654 
655 	if (!MEM_ERROR(ec))
656 		return false;
657 
658 	switch (xec) {
659 	case 0x0 ... 0xa:
660 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
661 		break;
662 
663 	case 0xd:
664 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
665 		break;
666 
667 	case 0x10:
668 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
669 		break;
670 
671 	case 0x11 ... 0x15:
672 		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
673 		break;
674 
675 	default:
676 		ret = false;
677 	}
678 	return ret;
679 }
680 
681 static void decode_mc1_mce(struct mce *m)
682 {
683 	u16 ec = EC(m->status);
684 	u8 xec = XEC(m->status, xec_mask);
685 
686 	pr_emerg(HW_ERR "MC1 Error: ");
687 
688 	if (TLB_ERROR(ec))
689 		pr_cont("%s TLB %s.\n", LL_MSG(ec),
690 			(xec ? "multimatch" : "parity error"));
691 	else if (BUS_ERROR(ec)) {
692 		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
693 
694 		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
695 	} else if (INT_ERROR(ec)) {
696 		if (xec <= 0x3f)
697 			pr_cont("Hardware Assert.\n");
698 		else
699 			goto wrong_mc1_mce;
700 	} else if (fam_ops.mc1_mce(ec, xec))
701 		;
702 	else
703 		goto wrong_mc1_mce;
704 
705 	return;
706 
707 wrong_mc1_mce:
708 	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
709 }
710 
711 static bool k8_mc2_mce(u16 ec, u8 xec)
712 {
713 	bool ret = true;
714 
715 	if (xec == 0x1)
716 		pr_cont(" in the write data buffers.\n");
717 	else if (xec == 0x3)
718 		pr_cont(" in the victim data buffers.\n");
719 	else if (xec == 0x2 && MEM_ERROR(ec))
720 		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
721 	else if (xec == 0x0) {
722 		if (TLB_ERROR(ec))
723 			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
724 				TT_MSG(ec));
725 		else if (BUS_ERROR(ec))
726 			pr_cont(": %s/ECC error in data read from NB: %s.\n",
727 				R4_MSG(ec), PP_MSG(ec));
728 		else if (MEM_ERROR(ec)) {
729 			u8 r4 = R4(ec);
730 
731 			if (r4 >= 0x7)
732 				pr_cont(": %s error during data copyback.\n",
733 					R4_MSG(ec));
734 			else if (r4 <= 0x1)
735 				pr_cont(": %s parity/ECC error during data "
736 					"access from L2.\n", R4_MSG(ec));
737 			else
738 				ret = false;
739 		} else
740 			ret = false;
741 	} else
742 		ret = false;
743 
744 	return ret;
745 }
746 
747 static bool f15h_mc2_mce(u16 ec, u8 xec)
748 {
749 	bool ret = true;
750 
751 	if (TLB_ERROR(ec)) {
752 		if (xec == 0x0)
753 			pr_cont("Data parity TLB read error.\n");
754 		else if (xec == 0x1)
755 			pr_cont("Poison data provided for TLB fill.\n");
756 		else
757 			ret = false;
758 	} else if (BUS_ERROR(ec)) {
759 		if (xec > 2)
760 			ret = false;
761 
762 		pr_cont("Error during attempted NB data read.\n");
763 	} else if (MEM_ERROR(ec)) {
764 		switch (xec) {
765 		case 0x4 ... 0xc:
766 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
767 			break;
768 
769 		case 0x10 ... 0x14:
770 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
771 			break;
772 
773 		default:
774 			ret = false;
775 		}
776 	} else if (INT_ERROR(ec)) {
777 		if (xec <= 0x3f)
778 			pr_cont("Hardware Assert.\n");
779 		else
780 			ret = false;
781 	}
782 
783 	return ret;
784 }
785 
786 static bool f16h_mc2_mce(u16 ec, u8 xec)
787 {
788 	u8 r4 = R4(ec);
789 
790 	if (!MEM_ERROR(ec))
791 		return false;
792 
793 	switch (xec) {
794 	case 0x04 ... 0x05:
795 		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
796 		break;
797 
798 	case 0x09 ... 0x0b:
799 	case 0x0d ... 0x0f:
800 		pr_cont("ECC error in L2 tag (%s).\n",
801 			((r4 == R4_GEN)   ? "BankReq" :
802 			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
803 		break;
804 
805 	case 0x10 ... 0x19:
806 	case 0x1b:
807 		pr_cont("ECC error in L2 data array (%s).\n",
808 			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
809 			((r4 == R4_GEN)   ? "Attr" :
810 			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
811 		break;
812 
813 	case 0x1c ... 0x1d:
814 	case 0x1f:
815 		pr_cont("Parity error in L2 attribute bits (%s).\n",
816 			((r4 == R4_RD)  ? "Hit"  :
817 			((r4 == R4_GEN) ? "Attr" : "Fill")));
818 		break;
819 
820 	default:
821 		return false;
822 	}
823 
824 	return true;
825 }
826 
827 static void decode_mc2_mce(struct mce *m)
828 {
829 	u16 ec = EC(m->status);
830 	u8 xec = XEC(m->status, xec_mask);
831 
832 	pr_emerg(HW_ERR "MC2 Error: ");
833 
834 	if (!fam_ops.mc2_mce(ec, xec))
835 		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
836 }
837 
838 static void decode_mc3_mce(struct mce *m)
839 {
840 	u16 ec = EC(m->status);
841 	u8 xec = XEC(m->status, xec_mask);
842 
843 	if (boot_cpu_data.x86 >= 0x14) {
844 		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
845 			 " please report on LKML.\n");
846 		return;
847 	}
848 
849 	pr_emerg(HW_ERR "MC3 Error");
850 
851 	if (xec == 0x0) {
852 		u8 r4 = R4(ec);
853 
854 		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
855 			goto wrong_mc3_mce;
856 
857 		pr_cont(" during %s.\n", R4_MSG(ec));
858 	} else
859 		goto wrong_mc3_mce;
860 
861 	return;
862 
863  wrong_mc3_mce:
864 	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
865 }
866 
867 static void decode_mc4_mce(struct mce *m)
868 {
869 	unsigned int fam = x86_family(m->cpuid);
870 	int node_id = amd_get_nb_id(m->extcpu);
871 	u16 ec = EC(m->status);
872 	u8 xec = XEC(m->status, 0x1f);
873 	u8 offset = 0;
874 
875 	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
876 
877 	switch (xec) {
878 	case 0x0 ... 0xe:
879 
880 		/* special handling for DRAM ECCs */
881 		if (xec == 0x0 || xec == 0x8) {
882 			/* no ECCs on F11h */
883 			if (fam == 0x11)
884 				goto wrong_mc4_mce;
885 
886 			pr_cont("%s.\n", mc4_mce_desc[xec]);
887 
888 			if (decode_dram_ecc)
889 				decode_dram_ecc(node_id, m);
890 			return;
891 		}
892 		break;
893 
894 	case 0xf:
895 		if (TLB_ERROR(ec))
896 			pr_cont("GART Table Walk data error.\n");
897 		else if (BUS_ERROR(ec))
898 			pr_cont("DMA Exclusion Vector Table Walk error.\n");
899 		else
900 			goto wrong_mc4_mce;
901 		return;
902 
903 	case 0x19:
904 		if (fam == 0x15 || fam == 0x16)
905 			pr_cont("Compute Unit Data Error.\n");
906 		else
907 			goto wrong_mc4_mce;
908 		return;
909 
910 	case 0x1c ... 0x1f:
911 		offset = 13;
912 		break;
913 
914 	default:
915 		goto wrong_mc4_mce;
916 	}
917 
918 	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
919 	return;
920 
921  wrong_mc4_mce:
922 	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
923 }
924 
925 static void decode_mc5_mce(struct mce *m)
926 {
927 	unsigned int fam = x86_family(m->cpuid);
928 	u16 ec = EC(m->status);
929 	u8 xec = XEC(m->status, xec_mask);
930 
931 	if (fam == 0xf || fam == 0x11)
932 		goto wrong_mc5_mce;
933 
934 	pr_emerg(HW_ERR "MC5 Error: ");
935 
936 	if (INT_ERROR(ec)) {
937 		if (xec <= 0x1f) {
938 			pr_cont("Hardware Assert.\n");
939 			return;
940 		} else
941 			goto wrong_mc5_mce;
942 	}
943 
944 	if (xec == 0x0 || xec == 0xc)
945 		pr_cont("%s.\n", mc5_mce_desc[xec]);
946 	else if (xec <= 0xd)
947 		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
948 	else
949 		goto wrong_mc5_mce;
950 
951 	return;
952 
953  wrong_mc5_mce:
954 	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
955 }
956 
957 static void decode_mc6_mce(struct mce *m)
958 {
959 	u8 xec = XEC(m->status, xec_mask);
960 
961 	pr_emerg(HW_ERR "MC6 Error: ");
962 
963 	if (xec > 0x5)
964 		goto wrong_mc6_mce;
965 
966 	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
967 	return;
968 
969  wrong_mc6_mce:
970 	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
971 }
972 
973 /* Decode errors according to Scalable MCA specification */
974 static void decode_smca_error(struct mce *m)
975 {
976 	struct smca_hwid *hwid;
977 	enum smca_bank_types bank_type;
978 	const char *ip_name;
979 	u8 xec = XEC(m->status, xec_mask);
980 
981 	if (m->bank >= ARRAY_SIZE(smca_banks))
982 		return;
983 
984 	hwid = smca_banks[m->bank].hwid;
985 	if (!hwid)
986 		return;
987 
988 	bank_type = hwid->bank_type;
989 
990 	if (bank_type == SMCA_RESERVED) {
991 		pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
992 		return;
993 	}
994 
995 	ip_name = smca_get_long_name(bank_type);
996 
997 	pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec);
998 
999 	/* Only print the decode of valid error codes */
1000 	if (xec < smca_mce_descs[bank_type].num_descs &&
1001 			(hwid->xec_bitmap & BIT_ULL(xec))) {
1002 		pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]);
1003 	}
1004 
1005 	if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
1006 		decode_dram_ecc(cpu_to_node(m->extcpu), m);
1007 }
1008 
1009 static inline void amd_decode_err_code(u16 ec)
1010 {
1011 	if (INT_ERROR(ec)) {
1012 		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
1013 		return;
1014 	}
1015 
1016 	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
1017 
1018 	if (BUS_ERROR(ec))
1019 		pr_cont(", mem/io: %s", II_MSG(ec));
1020 	else
1021 		pr_cont(", tx: %s", TT_MSG(ec));
1022 
1023 	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
1024 		pr_cont(", mem-tx: %s", R4_MSG(ec));
1025 
1026 		if (BUS_ERROR(ec))
1027 			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
1028 	}
1029 
1030 	pr_cont("\n");
1031 }
1032 
1033 /*
1034  * Filter out unwanted MCE signatures here.
1035  */
1036 static bool ignore_mce(struct mce *m)
1037 {
1038 	/*
1039 	 * NB GART TLB error reporting is disabled by default.
1040 	 */
1041 	if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5 && !report_gart_errors)
1042 		return true;
1043 
1044 	return false;
1045 }
1046 
1047 static const char *decode_error_status(struct mce *m)
1048 {
1049 	if (m->status & MCI_STATUS_UC) {
1050 		if (m->status & MCI_STATUS_PCC)
1051 			return "System Fatal error.";
1052 		if (m->mcgstatus & MCG_STATUS_RIPV)
1053 			return "Uncorrected, software restartable error.";
1054 		return "Uncorrected, software containable error.";
1055 	}
1056 
1057 	if (m->status & MCI_STATUS_DEFERRED)
1058 		return "Deferred error, no action required.";
1059 
1060 	return "Corrected error, no action required.";
1061 }
1062 
1063 static int
1064 amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
1065 {
1066 	struct mce *m = (struct mce *)data;
1067 	unsigned int fam = x86_family(m->cpuid);
1068 	int ecc;
1069 
1070 	if (ignore_mce(m))
1071 		return NOTIFY_STOP;
1072 
1073 	pr_emerg(HW_ERR "%s\n", decode_error_status(m));
1074 
1075 	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
1076 		m->extcpu,
1077 		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
1078 		m->bank,
1079 		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
1080 		((m->status & MCI_STATUS_UC)	? "UE"	  :
1081 		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
1082 		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
1083 		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"),
1084 		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));
1085 
1086 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
1087 		u32 low, high;
1088 		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
1089 
1090 		if (!rdmsr_safe(addr, &low, &high) &&
1091 		    (low & MCI_CONFIG_MCAX))
1092 			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
1093 
1094 		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
1095 	}
1096 
1097 	/* do the two bits[14:13] together */
1098 	ecc = (m->status >> 45) & 0x3;
1099 	if (ecc)
1100 		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
1101 
1102 	if (fam >= 0x15) {
1103 		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
1104 
1105 		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
1106 		if (fam != 0x15 || m->bank != 4)
1107 			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
1108 	}
1109 
1110 	if (fam >= 0x17)
1111 		pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));
1112 
1113 	pr_cont("]: 0x%016llx\n", m->status);
1114 
1115 	if (m->status & MCI_STATUS_ADDRV)
1116 		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);
1117 
1118 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
1119 		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
1120 
1121 		if (m->status & MCI_STATUS_SYNDV)
1122 			pr_cont(", Syndrome: 0x%016llx", m->synd);
1123 
1124 		pr_cont("\n");
1125 
1126 		decode_smca_error(m);
1127 		goto err_code;
1128 	}
1129 
1130 	if (m->tsc)
1131 		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);
1132 
1133 	/* Doesn't matter which member to test. */
1134 	if (!fam_ops.mc0_mce)
1135 		goto err_code;
1136 
1137 	switch (m->bank) {
1138 	case 0:
1139 		decode_mc0_mce(m);
1140 		break;
1141 
1142 	case 1:
1143 		decode_mc1_mce(m);
1144 		break;
1145 
1146 	case 2:
1147 		decode_mc2_mce(m);
1148 		break;
1149 
1150 	case 3:
1151 		decode_mc3_mce(m);
1152 		break;
1153 
1154 	case 4:
1155 		decode_mc4_mce(m);
1156 		break;
1157 
1158 	case 5:
1159 		decode_mc5_mce(m);
1160 		break;
1161 
1162 	case 6:
1163 		decode_mc6_mce(m);
1164 		break;
1165 
1166 	default:
1167 		break;
1168 	}
1169 
1170  err_code:
1171 	amd_decode_err_code(m->status & 0xffff);
1172 
1173 	return NOTIFY_STOP;
1174 }
1175 
1176 static struct notifier_block amd_mce_dec_nb = {
1177 	.notifier_call	= amd_decode_mce,
1178 	.priority	= MCE_PRIO_EDAC,
1179 };
1180 
1181 static int __init mce_amd_init(void)
1182 {
1183 	struct cpuinfo_x86 *c = &boot_cpu_data;
1184 
1185 	if (c->x86_vendor != X86_VENDOR_AMD &&
1186 	    c->x86_vendor != X86_VENDOR_HYGON)
1187 		return -ENODEV;
1188 
1189 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
1190 		xec_mask = 0x3f;
1191 		goto out;
1192 	}
1193 
1194 	switch (c->x86) {
1195 	case 0xf:
1196 		fam_ops.mc0_mce = k8_mc0_mce;
1197 		fam_ops.mc1_mce = k8_mc1_mce;
1198 		fam_ops.mc2_mce = k8_mc2_mce;
1199 		break;
1200 
1201 	case 0x10:
1202 		fam_ops.mc0_mce = f10h_mc0_mce;
1203 		fam_ops.mc1_mce = k8_mc1_mce;
1204 		fam_ops.mc2_mce = k8_mc2_mce;
1205 		break;
1206 
1207 	case 0x11:
1208 		fam_ops.mc0_mce = k8_mc0_mce;
1209 		fam_ops.mc1_mce = k8_mc1_mce;
1210 		fam_ops.mc2_mce = k8_mc2_mce;
1211 		break;
1212 
1213 	case 0x12:
1214 		fam_ops.mc0_mce = f12h_mc0_mce;
1215 		fam_ops.mc1_mce = k8_mc1_mce;
1216 		fam_ops.mc2_mce = k8_mc2_mce;
1217 		break;
1218 
1219 	case 0x14:
1220 		fam_ops.mc0_mce = cat_mc0_mce;
1221 		fam_ops.mc1_mce = cat_mc1_mce;
1222 		fam_ops.mc2_mce = k8_mc2_mce;
1223 		break;
1224 
1225 	case 0x15:
1226 		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
1227 
1228 		fam_ops.mc0_mce = f15h_mc0_mce;
1229 		fam_ops.mc1_mce = f15h_mc1_mce;
1230 		fam_ops.mc2_mce = f15h_mc2_mce;
1231 		break;
1232 
1233 	case 0x16:
1234 		xec_mask = 0x1f;
1235 		fam_ops.mc0_mce = cat_mc0_mce;
1236 		fam_ops.mc1_mce = cat_mc1_mce;
1237 		fam_ops.mc2_mce = f16h_mc2_mce;
1238 		break;
1239 
1240 	case 0x17:
1241 	case 0x18:
1242 		pr_warn("Decoding supported only on Scalable MCA processors.\n");
1243 		return -EINVAL;
1244 
1245 	default:
1246 		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1247 		return -EINVAL;
1248 	}
1249 
1250 out:
1251 	pr_info("MCE: In-kernel MCE decoding enabled.\n");
1252 
1253 	mce_register_decode_chain(&amd_mce_dec_nb);
1254 
1255 	return 0;
1256 }
1257 early_initcall(mce_amd_init);
1258 
#ifdef MODULE
/* Module unload: take the decoder off the MCE decode chain again. */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
}
module_exit(mce_amd_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS("edac-mce-amd");
MODULE_DESCRIPTION("AMD MCE decoder");
#endif
1270