/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * AMD Athlon64/Opteron CPU Module Machine-Check Poller
 *
 * The AMD Opteron processor doesn't yet report correctable errors via #mc's.
 * Instead, it fixes the problem, silently updates the error state MSRs, and
 * resumes operation.  In order to discover occurrences of correctable errors,
 * we have to poll in the background using the omni cyclics mechanism.  The
 * error injector also has the ability to manually request an immediate poll.
 * Locking is fairly simple within the poller: the per-CPU mutex
 * ao->ao_mca.ao_mca_poll_lock ensures that only one poll request is active.
 */
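
/*
 * Control-flow summary: both the omni cyclic handler (ao_mca_poll_cyclic)
 * and the error injector entry point (ao_mca_poke) funnel into
 * ao_mca_poll_common(), which performs the actual bank logout while
 * holding the per-CPU ao_mca_poll_lock.
 */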

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/x86_archext.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/sdt.h>

#include "ao.h"

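/*
 * Poll trace tunables: each CPU gets a ring buffer of
 * ao_mca_poll_trace_nent entries, allocated at init time only when
 * ao_mca_poll_trace_always is set (the default on DEBUG kernels).
 */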
static uint_t ao_mca_poll_trace_nent = 100;
#ifdef DEBUG
static uint_t ao_mca_poll_trace_always = 1;
#else
static uint_t ao_mca_poll_trace_always = 0;
#endif

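/*
 * The omni cyclic installed by ao_mca_poll_start() fires every
 * ao_mca_poll_interval nanoseconds (ten seconds by default); a value of
 * zero disables background polling altogether.
 */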
static cyclic_id_t ao_mca_poll_cycid;
static hrtime_t ao_mca_poll_interval = NANOSEC * 10ULL;

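/*
 * Record a poll event in the per-CPU trace ring buffer.  The caller must
 * hold ao_mca_poll_lock.  Each entry notes what triggered the poll, how
 * many errors were found (for cyclic polls), and a gethrtime() timestamp.
 */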
static void
ao_mca_poll_trace(ao_mca_t *mca, uint32_t what, uint32_t nerr)
{
	uint_t next;
	ao_mca_poll_trace_t *pt;

	ASSERT(MUTEX_HELD(&mca->ao_mca_poll_lock));
	DTRACE_PROBE2(ao__poll__trace, uint32_t, what, uint32_t, nerr);

	if (mca->ao_mca_poll_trace == NULL)
		return; /* poll trace buffer is disabled */

	next = (mca->ao_mca_poll_curtrace + 1) % ao_mca_poll_trace_nent;
	pt = &mca->ao_mca_poll_trace[next];

	pt->mpt_when = 0;
	pt->mpt_what = what;

	if (what == AO_MPT_WHAT_CYC_ERR)
		pt->mpt_nerr = MIN(nerr, UINT8_MAX);

	pt->mpt_when = gethrtime();
	mca->ao_mca_poll_curtrace = next;
}

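/*
 * Common poll body, called with ao_mca_poll_lock held.  When a previously
 * faulted CPU is re-enabled, the first poll clears AO_MCA_F_UNFAULTING;
 * if that poll is a cyclic poll we just clear the bank status registers
 * and return.  Otherwise we log out any error state via ao_mca_logout(),
 * record a trace entry, and panic if an uncorrectable error was found and
 * panicking on such errors is enabled.
 */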
static void
ao_mca_poll_common(ao_mca_t *mca, int what)
{
	ao_cpu_logout_t *acl = &mca->ao_mca_logout[AO_MCA_LOGOUT_POLLER];
	int i, n, fatal;

	if (mca->ao_mca_flags & AO_MCA_F_UNFAULTING) {
		mca->ao_mca_flags &= ~AO_MCA_F_UNFAULTING;
		ao_mca_poll_trace(mca, AO_MPT_WHAT_UNFAULTING, 0);

		/*
		 * On the first poll after re-enabling a faulty CPU we clear
		 * the status registers; see ao_faulted_exit() for more info.
		 */
		if (what == AO_MPT_WHAT_CYC_ERR) {
			for (i = 0; i < AMD_MCA_BANK_COUNT; i++)
				wrmsr(ao_bank_regs[i].abr_status, 0);
			return;
		}
	}

	fatal = ao_mca_logout(acl, NULL, &n);
	ao_mca_poll_trace(mca, what, n);

	if (fatal && cmi_panic_on_uncorrectable_error)
		fm_panic("Unrecoverable Machine-Check Error (polled)");
}

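/*
 * Omni cyclic handler, invoked on each CPU at CY_LOW_LEVEL.  We use
 * mutex_tryenter() so that a poll already in progress (e.g. a poke from
 * the error injector) simply causes this firing to be skipped rather
 * than blocking in cyclic context.
 */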
static void
ao_mca_poll_cyclic(void *arg)
{
	ao_data_t *ao = arg;

	if (ao != NULL && mutex_tryenter(&ao->ao_mca.ao_mca_poll_lock)) {
		ao_mca_poll_common(&ao->ao_mca, AO_MPT_WHAT_CYC_ERR);
		mutex_exit(&ao->ao_mca.ao_mca_poll_lock);
	}
}

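/*
 * Entry point used by the error injector to request an immediate poll of
 * the given CPU's ao_data_t.  Unlike the cyclic handler, this blocks
 * until the poll lock is available.
 */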
void
ao_mca_poke(void *arg)
{
	ao_data_t *ao = arg;

	mutex_enter(&ao->ao_mca.ao_mca_poll_lock);
	ao_mca_poll_common(&ao->ao_mca, AO_MPT_WHAT_POKE_ERR);
	mutex_exit(&ao->ao_mca.ao_mca_poll_lock);
}

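/*
 * Omni cyclic per-CPU online handler: fill in the cyclic handler and
 * firing time for the CPU coming on-line, or effectively disable the
 * cyclic for CPUs that this module does not handle.
 */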
/*ARGSUSED*/
static void
ao_mca_poll_online(void *arg, cpu_t *cpu, cyc_handler_t *cyh, cyc_time_t *cyt)
{
	cyt->cyt_when = 0;
	cyh->cyh_level = CY_LOW_LEVEL;

	/*
	 * If the CPU coming on-line isn't supported by this CPU module, then
	 * disable the cyclic by cranking cyt_interval and setting arg to NULL.
	 */
	if (cpu->cpu_m.mcpu_cmi != NULL &&
	    cpu->cpu_m.mcpu_cmi->cmi_ops != &_cmi_ops) {
		cyt->cyt_interval = INT64_MAX;
		cyh->cyh_func = ao_mca_poll_cyclic;
		cyh->cyh_arg = NULL;
	} else {
		cyt->cyt_interval = ao_mca_poll_interval;
		cyh->cyh_func = ao_mca_poll_cyclic;
		cyh->cyh_arg = cpu->cpu_m.mcpu_cmidata;
	}
}

/*ARGSUSED*/
static void
ao_mca_poll_offline(void *arg, cpu_t *cpu, void *cyh_arg)
{
	/* nothing to do here */
}

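/*
 * Per-CPU initialization: set up the poll lock and, if tracing is
 * enabled, allocate the poll trace ring buffer.
 */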
void
ao_mca_poll_init(ao_mca_t *mca)
{
	mutex_init(&mca->ao_mca_poll_lock, NULL, MUTEX_DRIVER, NULL);

	if (ao_mca_poll_trace_always) {
		mca->ao_mca_poll_trace =
		    kmem_zalloc(sizeof (ao_mca_poll_trace_t) *
		    ao_mca_poll_trace_nent, KM_SLEEP);
		mca->ao_mca_poll_curtrace = 0;
	}
}

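/*
 * Install the omni cyclic that drives background polling.  Nothing is
 * installed if ao_mca_poll_interval has been tuned to zero.
 */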
void
ao_mca_poll_start(void)
{
	cyc_omni_handler_t cyo;

	if (ao_mca_poll_interval == 0)
		return; /* if manually tuned to zero, disable polling */

	cyo.cyo_online = ao_mca_poll_online;
	cyo.cyo_offline = ao_mca_poll_offline;
	cyo.cyo_arg = NULL;

	mutex_enter(&cpu_lock);
	ao_mca_poll_cycid = cyclic_add_omni(&cyo);
	mutex_exit(&cpu_lock);
}