1 /* Copyright (C) 2008 Intel Corporation
2    Decode Intel Nehalem specific machine check errors.
3 
4    mcelog is free software; you can redistribute it and/or
5    modify it under the terms of the GNU General Public
6    License as published by the Free Software Foundation; version
7    2.
8 
9    mcelog is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12    General Public License for more details.
13 
14    You should find a copy of v2 of the GNU General Public License somewhere
15    on your Linux system; if not, write to the Free Software Foundation,
16    Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 
18    Author: Andi Kleen
19 */
20 
21 #include <string.h>
22 #include <stdio.h>
23 #include "mcelog.h"
24 #include "nehalem.h"
25 #include "bitfield.h"
26 #include "memdb.h"
27 
28 /* See IA32 SDM Vol3B Appendix E.3.2 ff */
29 
30 /* MC1_STATUS error */
31 static struct field qpi_status[] = {
32 	SBITFIELD(16, "QPI header had bad parity"),
33 	SBITFIELD(17, "QPI Data packet had bad parity"),
34 	SBITFIELD(18, "Number of QPI retries exceeded"),
35 	SBITFIELD(19, "Received QPI data packet that was poisoned by sender"),
36 	SBITFIELD(20, "QPI reserved 20"),
37 	SBITFIELD(21, "QPI reserved 21"),
38 	SBITFIELD(22, "QPI received unsupported message encoding"),
39 	SBITFIELD(23, "QPI credit type is not supported"),
40 	SBITFIELD(24, "Sender sent too many QPI flits to the receiver"),
41 	SBITFIELD(25, "QPI Sender sent a failed response to receiver"),
42 	SBITFIELD(26, "Clock jitter detected in internal QPI clocking"),
43 	{}
44 };
45 
46 static struct field qpi_misc[] = {
47 	SBITFIELD(14, "QPI misc reserved 14"),
48 	SBITFIELD(15, "QPI misc reserved 15"),
49 	SBITFIELD(24, "QPI Interleave/Head Indication Bit (IIB)"),
50 	{}
51 };
52 
53 static struct numfield qpi_numbers[] = {
54 	HEXNUMBER(0, 7, "QPI class and opcode of packet with error"),
55 	HEXNUMBER(8, 13, "QPI Request Transaction ID"),
56 	NUMBERFORCE(16, 18, "QPI Requestor/Home Node ID (RHNID)"),
57 	HEXNUMBER(19, 23, "QPI miscreserved 19-23"),
58 	{},
59 };
60 
61 static struct field nhm_memory_status[] = {
62 	SBITFIELD(16, "Memory read ECC error"),
63 	SBITFIELD(17, "Memory ECC error occurred during scrub"),
64 	SBITFIELD(18, "Memory write parity error"),
65 	SBITFIELD(19, "Memory error in half of redundant memory"),
66 	SBITFIELD(20, "Memory reserved 20"),
67 	SBITFIELD(21, "Memory access out of range"),
68 	SBITFIELD(22, "Memory internal RTID invalid"),
69 	SBITFIELD(23, "Memory address parity error"),
70 	SBITFIELD(24, "Memory byte enable parity error"),
71 	{}
72 };
73 
74 static struct numfield nhm_memory_status_numbers[] = {
75 	HEXNUMBER(25, 37, "Memory MISC reserved 25..37"),
76 	NUMBERFORCE(38, 52, "Memory corrected error count (CORE_ERR_CNT)"),
77 	HEXNUMBER(53, 56, "Memory MISC reserved 53..56"),
78 	{}
79 };
80 
81 static struct numfield nhm_memory_misc_numbers[] = {
82 	HEXNUMBERFORCE(0, 7, "Memory transaction Tracker ID (RTId)"),
83 	NUMBERFORCE(16, 17, "Memory DIMM ID of error"),
84 	NUMBERFORCE(18, 19, "Memory channel ID of error"),
85 	HEXNUMBERFORCE(32, 63, "Memory ECC syndrome"),
86 	{}
87 };
88 
/*
 * Messages for the internal error code, indexed by code value.
 *
 * The table is sparse: codes without an initializer below are NULL
 * pointers, and the array has 0x1b entries (highest index 0x1a plus one).
 * It is referenced by the FIELD() entry in internal_error_status.
 */
static char *internal_errors[] = {
	[0x00] = "No Error",
	[0x03] = "Reset firmware did not complete",
	[0x08] = "Received an invalid CMPD",
	[0x0a] = "Invalid Power Management Request",
	[0x0d] = "Invalid S-state transition",
	[0x11] = "VID controller does not match POC controller selected",
	[0x1a] = "MSID from POC does not match CPU MSID",
};
98 
99 static struct field internal_error_status[] = {
100 	FIELD(24, internal_errors),
101 	{}
102 };
103 
104 static struct numfield internal_error_numbers[] = {
105 	HEXNUMBER(16, 23, "Internal machine check status reserved 16..23"),
106 	HEXNUMBER(32, 56, "Internal machine check status reserved 32..56"),
107 	{},
108 };
109 
110 /* Generic architectural memory controller encoding */
111 
/*
 * Short mnemonics for the 3-bit request type, indexed by
 * (status >> 4) & 7 in decode_memory_controller().  Read-only, so both
 * the strings and the pointer array are const.
 */
static const char *const mmm_mnemonic[] = {
	[0] = "GEN",
	[1] = "RD",
	[2] = "WR",
	[3] = "AC",
	[4] = "MS",
	[5] = "RES5",
	[6] = "RES6",
	[7] = "RES7",
};
/*
 * Long descriptions for the 3-bit request type, indexed by
 * (status >> 4) & 7 in decode_memory_controller().  Entries line up
 * one-to-one with mmm_mnemonic.  Read-only, so both the strings and the
 * pointer array are const.
 */
static const char *const mmm_desc[] = {
	[0] = "Generic undefined request",
	[1] = "Memory read error",
	[2] = "Memory write error",
	[3] = "Address/Command error",
	[4] = "Memory scrubbing error",
	[5] = "Reserved 5",
	[6] = "Reserved 6",
	[7] = "Reserved 7",
};
125 
/*
 * Print the generic memory controller error encoding via Wprintf().
 *
 * status: error code; bits 0..3 select the channel (0xf prints as
 *         "unspecified") and bits 4..6 index mmm_mnemonic / mmm_desc.
 * bank:   bank number; only consulted on Knights Landing / Knights Mill,
 *         where bank 15 adds 3 to the reported channel number.
 */
void decode_memory_controller(u32 status, u8 bank)
{
	char channel[30];
	u32 chan = status & 0xf;
	u32 mmm = (status >> 4) & 7;	/* 3-bit index, always within the 8-entry tables */

	if (chan == 0xf) {
		snprintf(channel, sizeof channel, "%s", "unspecified");
	} else {
		/* Fix for Knights Landing/Mill MIC */
		if (cputype == CPU_KNIGHTS_LANDING || cputype == CPU_KNIGHTS_MILL)
			chan += 3 * (bank == 15);
		/* Bounded write: cannot overrun channel[] even if the format changes */
		snprintf(channel, sizeof channel, "%u", chan);
	}

	Wprintf("MEMORY CONTROLLER %s_CHANNEL%s_ERR\n", mmm_mnemonic[mmm], channel);
	Wprintf("Transaction: %s\n", mmm_desc[mmm]);
}
143 
/*
 * Decode the model-specific parts of a machine check.
 *
 * status: status value; its low 16 bits are the MCA error code that
 *         selects which tables are applied.
 * misc:   misc value; only decoded when MCI_STATUS_MISCV is set in status.
 *
 * The checks run in a fixed order (QPI, internal, memory controller) and
 * the first match wins; codes matching none of them print nothing here.
 */
void nehalem_decode_model(u64 status, u64 misc)
{
	u32 mca = status & 0xffff;
	int misc_valid = (status & MCI_STATUS_MISCV) != 0;

	/* bus and interconnect QPI */
	if ((mca >> 11) == 1) {
		decode_bitfield(status, qpi_status);
		if (misc_valid) {
			decode_numfield(misc, qpi_numbers);
			decode_bitfield(misc, qpi_misc);
		}
		return;
	}

	/* internal unspecified */
	if (mca == 0x0001) {
		decode_bitfield(status, internal_error_status);
		decode_numfield(status, internal_error_numbers);
		return;
	}

	/* memory controller */
	if ((mca >> 7) == 1) {
		decode_bitfield(status, nhm_memory_status);
		decode_numfield(status, nhm_memory_status_numbers);
		if (misc_valid)
			decode_numfield(misc, nhm_memory_misc_numbers);
	}
}
163 
164 /* Only core errors supported. Same as Nehalem */
xeon75xx_decode_model(struct mce * m,unsigned msize)165 void xeon75xx_decode_model(struct mce *m, unsigned msize)
166 {
167 	u64 status = m->status;
168 	u32 mca = status & 0xffff;
169 	if (mca == 0x0001) { /* internal unspecified */
170 		decode_bitfield(status, internal_error_status);
171 		decode_numfield(status, internal_error_numbers);
172 	}
173 }
174 
175 /* Nehalem-EP specific DIMM decoding */
nehalem_memerr_misc(struct mce * m,int * channel,int * dimm)176 void nehalem_memerr_misc(struct mce *m, int *channel, int *dimm)
177 {
178 	if (m->status & MCI_STATUS_MISCV) {
179 		*channel = EXTRACT(m->misc, 18, 19);
180 		*dimm = EXTRACT(m->misc, 16, 17);
181 	}
182 }
183 
184