1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Page retirement can be an extended process due to the fact that a retirement
29  * may not be possible when the original request is made.  The kernel will
30  * repeatedly attempt to retire a given page, but will not let us know when the
31  * page has been retired.  We therefore have to poll to see if the retirement
32  * has been completed.  This poll is implemented with a bounded exponential
33  * backoff to reduce the burden which we impose upon the system.
34  *
35  * To reduce the burden on fmd in the face of retirement storms, we schedule
36  * all retries as a group.  In the simplest case, we attempt to retire a single
37  * page.  When forced to retry, we initially schedule a retry at a configurable
38  * interval t.  If the retry fails, we schedule another at 2 * t, and so on,
39  * until t reaches the maximum interval (also configurable).  Future retries
40  * for that page will occur with t equal to the maximum interval value.  We
41  * will never give up on a retirement.
42  *
43  * With multiple retirements, the situation gets slightly more complicated.  As
44  * indicated above, we schedule retries as a group.  We don't want to deny new
45  * pages their short retry intervals, so we'll (re)set the retry interval to the
46  * value appropriate for the newest page.
47  */
48 
49 #include <cma.h>
50 
51 #include <time.h>
52 #include <errno.h>
53 #include <unistd.h>
54 #include <strings.h>
55 #include <fm/fmd_api.h>
56 #include <fm/libtopo.h>
57 #include <fm/fmd_fmri.h>
58 #include <fm/fmd_agent.h>
59 #include <sys/fm/protocol.h>
60 
61 static void
62 cma_page_free(fmd_hdl_t *hdl, cma_page_t *page)
63 {
64 	if (page->pg_fmri != NULL)
65 		nvlist_free(page->pg_fmri);
66 	fmd_hdl_free(hdl, page, sizeof (cma_page_t));
67 }
68 
69 /*
70  * Retire the specified ASRU, referring to a memory page by PA or by DIMM
71  * offset (i.e. the encoded coordinates internal bank, row, and column).
72  * In the initial FMA implementation, fault.memory.page exported an ASRU
73  * with an explicit physical address, which is valid at the initial time of
74  * diagnosis but may not be later following DR, DIMM removal, or interleave
75  * changes.  On SPARC, this issue was solved by exporting the DIMM offset
76  * and pushing the entire FMRI to the platform memory controller through
77  * /dev/fm so it can derive the current PA from the DIMM and offset.
78  * On x86, we also encode DIMM and offset in hc-specific, which is then used
79  * by the x64 memory controller driver.
80  * At some point these three approaches need to be rationalized: all platforms
81  * should use the same scheme, either with decoding in the kernel or decoding
82  * in userland (i.e. with a libtopo method to compute and update the PA).
83  */
84 /*ARGSUSED*/
85 int
86 cma_page_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
87     const char *uuid, boolean_t repair)
88 {
89 	cma_page_t *page;
90 	uint64_t pageaddr;
91 	nvlist_t *fmri = NULL;
92 	const char *action = repair ? "unretire" : "retire";
93 	int rc;
94 #ifdef i386
95 	nvlist_t *rsrc, *hcsp;
96 
97 	/*
98 	 * On x86, retire is done by resource
99 	 */
100 	if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) != 0) {
101 		fmd_hdl_debug(hdl, "page retire resource lookup failed\n");
102 		cma_stats.bad_flts.fmds_value.ui64++;
103 		return (CMA_RA_FAILURE);
104 	}
105 	if (nvlist_dup(rsrc, &fmri, 0) != 0) {
106 		fmd_hdl_debug(hdl, "page retire nvlist dup failed\n");
107 		return (CMA_RA_FAILURE);
108 	}
109 #else /* i386 */
110 	if (nvlist_dup(asru, &fmri, 0) != 0) {
111 		fmd_hdl_debug(hdl, "page retire nvlist dup failed\n");
112 		return (CMA_RA_FAILURE);
113 	}
114 
115 	/* It should already be expanded, but we'll do it again anyway */
116 	if (fmd_nvl_fmri_expand(hdl, fmri) < 0) {
117 		fmd_hdl_debug(hdl, "failed to expand page asru\n");
118 		cma_stats.bad_flts.fmds_value.ui64++;
119 		nvlist_free(fmri);
120 		return (CMA_RA_FAILURE);
121 	}
122 #endif /* i386 */
123 
124 	if (!repair && !fmd_nvl_fmri_present(hdl, fmri)) {
125 		fmd_hdl_debug(hdl, "page retire overtaken by events\n");
126 		cma_stats.page_nonent.fmds_value.ui64++;
127 		nvlist_free(fmri);
128 		return (CMA_RA_SUCCESS);
129 	}
130 
131 #ifdef i386
132 	if (nvlist_lookup_nvlist(fmri, FM_FMRI_HC_SPECIFIC, &hcsp) != 0 ||
133 	    (nvlist_lookup_uint64(hcsp, "asru-" FM_FMRI_HC_SPECIFIC_PHYSADDR,
134 	    &pageaddr) != 0 && nvlist_lookup_uint64(hcsp,
135 	    FM_FMRI_HC_SPECIFIC_PHYSADDR, &pageaddr) != 0)) {
136 #else
137 	if (nvlist_lookup_uint64(fmri, FM_FMRI_MEM_PHYSADDR, &pageaddr)
138 	    != 0) {
139 #endif
140 		fmd_hdl_debug(hdl, "mem fault missing 'physaddr'\n");
141 		cma_stats.bad_flts.fmds_value.ui64++;
142 		nvlist_free(fmri);
143 		return (CMA_RA_FAILURE);
144 	}
145 
146 	if (repair) {
147 		if (!cma.cma_page_dounretire) {
148 			fmd_hdl_debug(hdl, "suppressed unretire of page %llx\n",
149 			    (u_longlong_t)pageaddr);
150 			cma_stats.page_supp.fmds_value.ui64++;
151 			nvlist_free(fmri);
152 			return (CMA_RA_SUCCESS);
153 		}
154 	} else {
155 		if (!cma.cma_page_doretire) {
156 			fmd_hdl_debug(hdl, "suppressed retire of page %llx\n",
157 			    (u_longlong_t)pageaddr);
158 			cma_stats.page_supp.fmds_value.ui64++;
159 			nvlist_free(fmri);
160 			return (CMA_RA_FAILURE);
161 		}
162 	}
163 
164 	if (repair)
165 		rc = cma_fmri_page_unretire(hdl, fmri);
166 	else
167 		rc = cma_fmri_page_retire(hdl, fmri);
168 	if (rc == FMD_AGENT_RETIRE_DONE) {
169 		fmd_hdl_debug(hdl, "%sd page 0x%llx\n",
170 		    action, (u_longlong_t)pageaddr);
171 		if (repair)
172 			cma_stats.page_repairs.fmds_value.ui64++;
173 		else
174 			cma_stats.page_flts.fmds_value.ui64++;
175 		nvlist_free(fmri);
176 		return (CMA_RA_SUCCESS);
177 	} else if (repair || rc != FMD_AGENT_RETIRE_ASYNC) {
178 		fmd_hdl_debug(hdl, "%s of page 0x%llx failed, will not "
179 		    "retry: %s\n", action, (u_longlong_t)pageaddr,
180 		    strerror(errno));
181 
182 		cma_stats.page_fails.fmds_value.ui64++;
183 
184 		nvlist_free(fmri);
185 		return (CMA_RA_FAILURE);
186 	}
187 
188 	/*
189 	 * The page didn't immediately retire.  We'll need to periodically
190 	 * check to see if it has been retired.
191 	 */
192 	fmd_hdl_debug(hdl, "page didn't retire - sleeping\n");
193 
194 	page = fmd_hdl_zalloc(hdl, sizeof (cma_page_t), FMD_SLEEP);
195 	page->pg_addr = pageaddr;
196 	page->pg_fmri = fmri;
197 	if (uuid != NULL)
198 		page->pg_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP);
199 
200 	page->pg_next = cma.cma_pages;
201 	cma.cma_pages = page;
202 
203 	if (cma.cma_page_timerid != 0)
204 		fmd_timer_remove(hdl, cma.cma_page_timerid);
205 
206 	cma.cma_page_curdelay = cma.cma_page_mindelay;
207 
208 	cma.cma_page_timerid =
209 	    fmd_timer_install(hdl, NULL, NULL, cma.cma_page_curdelay);
210 
211 	/* Don't free fmri here.  This FMRI will be needed for retry. */
212 	return (CMA_RA_FAILURE);
213 }
214 
215 static int
216 page_retry(fmd_hdl_t *hdl, cma_page_t *page)
217 {
218 	int rc;
219 
220 	if (page->pg_fmri != NULL && !fmd_nvl_fmri_present(hdl,
221 	    page->pg_fmri)) {
222 		fmd_hdl_debug(hdl, "page retire overtaken by events");
223 		cma_stats.page_nonent.fmds_value.ui64++;
224 
225 		if (page->pg_uuid != NULL)
226 			fmd_case_uuclose(hdl, page->pg_uuid);
227 		return (1); /* no longer a page to retire */
228 	}
229 
230 	rc = cma_fmri_page_service_state(hdl, page->pg_fmri);
231 	if (rc == FMD_SERVICE_STATE_UNUSABLE) {
232 		fmd_hdl_debug(hdl, "retired page 0x%llx on retry %u\n",
233 		    page->pg_addr, page->pg_nretries);
234 		cma_stats.page_flts.fmds_value.ui64++;
235 
236 		if (page->pg_uuid != NULL)
237 			fmd_case_uuclose(hdl, page->pg_uuid);
238 		return (1); /* page retired */
239 	}
240 
241 	if (rc == FMD_SERVICE_STATE_ISOLATE_PENDING) {
242 		fmd_hdl_debug(hdl, "scheduling another retry for 0x%llx\n",
243 		    page->pg_addr);
244 		return (0); /* schedule another retry */
245 	} else {
246 		fmd_hdl_debug(hdl, "failed to retry page 0x%llx "
247 		    "retirement: %s\n", page->pg_addr,
248 		    strerror(errno));
249 
250 		cma_stats.page_fails.fmds_value.ui64++;
251 		return (1); /* give up */
252 	}
253 }
254 
255 void
256 cma_page_retry(fmd_hdl_t *hdl)
257 {
258 	cma_page_t **pagep;
259 
260 	cma.cma_page_timerid = 0;
261 
262 	fmd_hdl_debug(hdl, "page_retry: timer fired\n");
263 
264 	pagep = &cma.cma_pages;
265 	while (*pagep != NULL) {
266 		cma_page_t *page = *pagep;
267 
268 		if (page_retry(hdl, page)) {
269 			/*
270 			 * Successful retry or we're giving up - remove from
271 			 * the list
272 			 */
273 			*pagep = page->pg_next;
274 
275 			if (page->pg_uuid != NULL)
276 				fmd_hdl_strfree(hdl, page->pg_uuid);
277 
278 			cma_page_free(hdl, page);
279 		} else {
280 			page->pg_nretries++;
281 			pagep = &page->pg_next;
282 		}
283 	}
284 
285 	if (cma.cma_pages == NULL)
286 		return; /* no more retirements */
287 
288 	/*
289 	 * We still have retirements that haven't completed.  Back the delay
290 	 * off, and schedule a retry.
291 	 */
292 	cma.cma_page_curdelay = MIN(cma.cma_page_curdelay * 2,
293 	    cma.cma_page_maxdelay);
294 
295 	fmd_hdl_debug(hdl, "scheduled page retirement retry for %llu secs\n",
296 	    (u_longlong_t)(cma.cma_page_curdelay / NANOSEC));
297 
298 	cma.cma_page_timerid =
299 	    fmd_timer_install(hdl, NULL, NULL, cma.cma_page_curdelay);
300 }
301 
302 void
303 cma_page_fini(fmd_hdl_t *hdl)
304 {
305 	cma_page_t *page;
306 
307 	while ((page = cma.cma_pages) != NULL) {
308 		cma.cma_pages = page->pg_next;
309 		cma_page_free(hdl, page);
310 	}
311 }
312