/* Copyright (C) 2009 Intel Corporation
   Author: Andi Kleen
   Memory error accounting per page

   mcelog is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public
   License as published by the Free Software Foundation; version
   2.

   mcelog is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should find a copy of v2 of the GNU General Public License somewhere
   on your Linux system; if not, write to the Free Software Foundation,
   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */

/* NB
   investigate other data structures. Primary consideration would
   be space efficiency. rbtree nodes are rather large.

   Do we need aging? Right now the only way to get rid of old nodes
   is to restart. */
#define _GNU_SOURCE 1
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <sys/mman.h>
#include <assert.h>
#include "memutil.h"
#include "trigger.h"
#include "mcelog.h"
#include "rbtree.h"
#include "list.h"
#include "leaky-bucket.h"
#include "page.h"
#include "config.h"
#include "memdb.h"
#include "sysfs.h"

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
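
/*
 * Note: accounting is done at a fixed 4K granularity; error addresses are
 * rounded down to this page size before lookup, independent of the kernel's
 * runtime page size.
 */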

enum { PAGE_ONLINE = 0, PAGE_OFFLINE = 1, PAGE_OFFLINE_FAILED = 2 };

struct mempage {
	struct rb_node nd;
	/* one char used by rb_node */
	char offlined;
	char triggered;
	// 1(32bit)-5(64bit) bytes of padding to play with here
	u64 addr;
	struct err_type ce;
};

/* Leave room for the trailing mp_used so the cluster always fits in one page */
#define N ((PAGE_SIZE - sizeof(struct list_head) - sizeof(int)) / sizeof(struct mempage))
#define to_cluster(mp)	(struct mempage_cluster *)((long)(mp) & ~((long)(PAGE_SIZE - 1)))

struct mempage_cluster {
	struct list_head lru;
	struct mempage mp[N];
	int mp_used;
};
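
/*
 * Worked example (sketch; the real value depends on sizeof(struct mempage)):
 * with 4K pages and, say, a 64-byte struct mempage on 64-bit, N is
 * (4096 - 16 - 4) / 64 = 63 counters per cluster.  Each cluster comes from
 * mmap() and is therefore page aligned, so to_cluster() can recover the
 * owning cluster from any embedded mempage pointer by masking off the low
 * PAGE_SHIFT bits:
 *
 *	struct mempage *mp = &cluster->mp[17];
 *	assert(to_cluster(mp) == cluster);
 */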

struct mempage_replacement {
	struct leaky_bucket bucket;
	unsigned count;
};

enum {
	MAX_ENV = 20,
};

static int corr_err_counters;
static struct mempage_cluster *mp_cluster;
static struct mempage_replacement mp_replacement;
static struct rb_root mempage_root;
static LIST_HEAD(mempage_cluster_lru_list);
static struct bucket_conf page_trigger_conf;
static struct bucket_conf mp_replacement_trigger_conf;
static char *page_error_pre_soft_trigger, *page_error_post_soft_trigger;

static const char *page_state[] = {
	[PAGE_ONLINE] = "online",
	[PAGE_OFFLINE] = "offline",
	[PAGE_OFFLINE_FAILED] = "offline-failed",
};

static struct mempage *mempage_alloc(void)
{
	if (!mp_cluster || mp_cluster->mp_used == N) {
		mp_cluster = mmap(0, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (mp_cluster == MAP_FAILED)
			Enomem();
	}

	return &mp_cluster->mp[mp_cluster->mp_used++];
}

static struct mempage *mempage_replace(void)
{
	/* If no free mp_cluster, reuse the last mp_cluster of the LRU list */
	if (mp_cluster->mp_used == N) {
		mp_cluster = list_last_entry(&mempage_cluster_lru_list, struct mempage_cluster, lru);
		mp_cluster->mp_used = 0;
	}

	return &mp_cluster->mp[mp_cluster->mp_used++];
}
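
/*
 * Note: recycling a cluster resets mp_used only; the old mempage entries in
 * it stay valid and remain linked in the rb-tree until their slot is handed
 * out again, at which point the caller re-keys each node via
 * mempage_rb_tree_update().
 */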

static struct mempage *mempage_lookup(u64 addr)
{
	struct rb_node *n = mempage_root.rb_node;

	while (n) {
		struct mempage *mp = rb_entry(n, struct mempage, nd);

		if (addr < mp->addr)
			n = n->rb_left;
		else if (addr > mp->addr)
			n = n->rb_right;
		else
			return mp;
	}
	return NULL;
}

static struct mempage *
mempage_insert_lookup(u64 addr, struct rb_node *node)
{
	struct rb_node **p = &mempage_root.rb_node;
	struct rb_node *parent = NULL;
	struct mempage *mp;

	while (*p) {
		parent = *p;
		mp = rb_entry(parent, struct mempage, nd);

		if (addr < mp->addr)
			p = &(*p)->rb_left;
		else if (addr > mp->addr)
			p = &(*p)->rb_right;
		else
			return mp;
	}
	rb_link_node(node, parent, p);
	rb_insert_color(node, &mempage_root);
	return NULL;
}

static struct mempage *mempage_insert(u64 addr, struct mempage *mp)
{
	mp->addr = addr;
	mp = mempage_insert_lookup(addr, &mp->nd);
	return mp;
}

static void mempage_rb_tree_update(u64 addr, struct mempage *mp)
{
	rb_erase(&mp->nd, &mempage_root);
	mempage_insert(addr, mp);
}

static void mempage_cluster_lru_list_insert(struct mempage_cluster *mp_cluster)
{
	list_add(&mp_cluster->lru, &mempage_cluster_lru_list);
}

static void mempage_cluster_lru_list_update(struct mempage_cluster *mp_cluster)
{
	if (list_is_first(&mp_cluster->lru, &mempage_cluster_lru_list))
		return;

	list_del(&mp_cluster->lru);
	list_add(&mp_cluster->lru, &mempage_cluster_lru_list);
}
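
/*
 * list_add() inserts at the head, so the head of the LRU list is the most
 * recently used cluster and list_last_entry() in mempage_replace() picks
 * the least recently used one as the replacement victim.
 */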

/* The following arrays all need to be kept in sync with the enum */

enum otype {
	OFFLINE_OFF,
	OFFLINE_ACCOUNT,
	OFFLINE_SOFT,
	OFFLINE_HARD,
	OFFLINE_SOFT_THEN_HARD
};

static const char *kernel_offline[] = {
	[OFFLINE_SOFT] = "/sys/devices/system/memory/soft_offline_page",
	[OFFLINE_HARD] = "/sys/devices/system/memory/hard_offline_page",
	[OFFLINE_SOFT_THEN_HARD] = "/sys/devices/system/memory/soft_offline_page"
};

static struct config_choice offline_choice[] = {
	{ "off", OFFLINE_OFF },
	{ "account", OFFLINE_ACCOUNT },
	{ "soft", OFFLINE_SOFT },
	{ "hard", OFFLINE_HARD },
	{ "soft-then-hard", OFFLINE_SOFT_THEN_HARD },
	{}
};

static enum otype offline = OFFLINE_OFF;
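
/*
 * The action is selected in mcelog.conf; an illustrative snippet (values
 * are examples, not defaults):
 *
 *	[page]
 *	memory-ce-threshold = 10 / 24h
 *	memory-ce-action = soft
 *
 * "account" only keeps statistics, "soft" asks the kernel to migrate the
 * page contents away before offlining, "hard" offlines unconditionally,
 * and "soft-then-hard" falls back to hard offlining when soft offlining
 * fails.
 */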

static int do_memory_offline(u64 addr, enum otype type)
{
	return sysfs_write(kernel_offline[type], "%#llx", addr);
}
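
/*
 * The sysfs write is equivalent to, e.g. (hypothetical address):
 *
 *	echo 0x2f54a000 > /sys/devices/system/memory/soft_offline_page
 */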

static int memory_offline(u64 addr)
{
	if (offline == OFFLINE_SOFT_THEN_HARD) {
		if (do_memory_offline(addr, OFFLINE_SOFT) < 0) {
			Lprintf("Soft offlining of page %llx failed, trying hard offlining\n",
				addr);
			return do_memory_offline(addr, OFFLINE_HARD);
		}
		return 0;
	}
	return do_memory_offline(addr, offline);
}

static void offline_action(struct mempage *mp, u64 addr)
{
	if (offline <= OFFLINE_ACCOUNT)
		return;
	Lprintf("Offlining page %llx\n", addr);
	if (memory_offline(addr) < 0) {
		Lprintf("Offlining page %llx failed: %s\n", addr, strerror(errno));
		mp->offlined = PAGE_OFFLINE_FAILED;
	} else
		mp->offlined = PAGE_OFFLINE;
}

/* Run a user defined trigger when the replacement threshold of the page
   error counter is crossed. */
static void counter_trigger(char *msg, time_t t, struct mempage_replacement *mr,
			    struct bucket_conf *bc, bool sync)
{
	struct leaky_bucket *bk = &mr->bucket;
	char *env[MAX_ENV], *out, *thresh;
	int i, ei = 0;

	thresh = bucket_output(bc, bk);
	xasprintf(&out, "%s: %s", msg, thresh);

	if (bc->log)
		Gprintf("%s\n", out);

	if (!bc->trigger)
		goto out;

	xasprintf(&env[ei++], "THRESHOLD=%s", thresh);
	xasprintf(&env[ei++], "TOTALCOUNT=%u", mr->count);
	if (t)
		xasprintf(&env[ei++], "LASTEVENT=%lu", t);
	xasprintf(&env[ei++], "AGETIME=%u", bc->agetime);
	xasprintf(&env[ei++], "MESSAGE=%s", out);
	xasprintf(&env[ei++], "THRESHOLD_COUNT=%d", bk->count);
	env[ei] = NULL;
	assert(ei < MAX_ENV);

	run_trigger(bc->trigger, NULL, env, sync, "page-error-counter");

	for (i = 0; i < ei; i++)
		free(env[i]);
out:
	free(out);
	free(thresh);
}
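
/*
 * A configured trigger script receives the values above in its environment;
 * a minimal consumer could look like this (illustrative sketch only):
 *
 *	#!/bin/sh
 *	logger "mcelog: $MESSAGE (total=$TOTALCOUNT threshold=$THRESHOLD)"
 *
 * THRESHOLD_COUNT, AGETIME and (when known) LASTEVENT are exported as well,
 * per the xasprintf() calls above.
 */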

void account_page_error(struct mce *m, int channel, int dimm)
{
	u64 addr = m->addr;
	struct mempage *mp;
	char *msg, *thresh;
	time_t t;
	unsigned cpu = m->extcpu ? m->extcpu : m->cpu;

	if (offline == OFFLINE_OFF)
		return;
	if (!(m->status & MCI_STATUS_ADDRV) || (m->status & MCI_STATUS_UC))
		return;

	switch (cputype) {
	case CPU_SANDY_BRIDGE_EP:
		/*
		 * On the SNB-EP platform we see corrected errors reported with
		 * an address in Bank 5 by the hardware (depending on BIOS
		 * settings), while a duplicate record is constructed from
		 * information found by the "firmware first" APEI code. Ignore
		 * the duplicate so that we don't double count errors.
		 *
		 * NOTE: the record from APEI fakes this error as coming from
		 * CPU 0 BANK 1.
		 */
		if (m->bank == 1 && cpu == 0)
			return;
		break;
	default:
		break;
	}

	t = m->time;
	addr &= ~((u64)PAGE_SIZE - 1);
	mp = mempage_lookup(addr);
	if (!mp && corr_err_counters < max_corr_err_counters) {
		mp = mempage_alloc();
		bucket_init(&mp->ce.bucket);
		mempage_insert(addr, mp);
		mempage_cluster_lru_list_insert(to_cluster(mp));
		corr_err_counters++;
	} else if (!mp) {
		mp = mempage_replace();
		bucket_init(&mp->ce.bucket);
		mempage_rb_tree_update(addr, mp);
		mempage_cluster_lru_list_update(to_cluster(mp));

		/* Report how often the replacement of counter 'mp' happened */
		++mp_replacement.count;
		if (__bucket_account(&mp_replacement_trigger_conf, &mp_replacement.bucket, 1, t)) {
			thresh = bucket_output(&mp_replacement_trigger_conf, &mp_replacement.bucket);
			xasprintf(&msg, "Replacements of page correctable error counter exceed threshold %s", thresh);
			free(thresh);

			counter_trigger(msg, t, &mp_replacement, &mp_replacement_trigger_conf, false);
			free(msg);
		}
	} else {
		mempage_cluster_lru_list_update(to_cluster(mp));
	}
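	/* Each corrected error feeds the page's leaky bucket; __bucket_account()
	   returns true when the configured threshold is crossed within the
	   aging window. */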
	++mp->ce.count;
	if (__bucket_account(&page_trigger_conf, &mp->ce.bucket, 1, t)) {
		struct memdimm *md;

		if (mp->offlined != PAGE_ONLINE)
			return;
		/* Only do triggers and messages for online pages */
		thresh = bucket_output(&page_trigger_conf, &mp->ce.bucket);
		md = get_memdimm(m->socketid, channel, dimm, 1);
		xasprintf(&msg, "Corrected memory errors on page %llx exceed threshold %s",
			addr, thresh);
		free(thresh);
		memdb_trigger(msg, md, t, &mp->ce, &page_trigger_conf, NULL, false, "page");
		free(msg);
		mp->triggered = 1;

		if (offline == OFFLINE_SOFT || offline == OFFLINE_SOFT_THEN_HARD) {
			struct bucket_conf page_soft_trigger_conf;
			char *argv[] = {
				NULL,
				NULL,
				NULL,
			};
			char *args;

			xasprintf(&args, "%llx", addr);

			memcpy(&page_soft_trigger_conf, &page_trigger_conf, sizeof(struct bucket_conf));
			page_soft_trigger_conf.trigger = page_error_pre_soft_trigger;
			argv[0] = page_error_pre_soft_trigger;
			argv[1] = args;
			xasprintf(&msg, "pre soft trigger run for page %llx", addr);
			memdb_trigger(msg, md, t, &mp->ce, &page_soft_trigger_conf, argv, true, "page_pre_soft");
			free(msg);

			offline_action(mp, addr);

			memcpy(&page_soft_trigger_conf, &page_trigger_conf, sizeof(struct bucket_conf));
			page_soft_trigger_conf.trigger = page_error_post_soft_trigger;
			argv[0] = page_error_post_soft_trigger;
			argv[1] = args;
			xasprintf(&msg, "post soft trigger run for page %llx", addr);
			memdb_trigger(msg, md, t, &mp->ce, &page_soft_trigger_conf, argv, true, "page_post_soft");
			free(msg);
			free(args);
		} else
			offline_action(mp, addr);
	}
}

void dump_page_errors(FILE *f)
{
	char *msg;
	struct rb_node *r;
	long k;

	k = 0;
	for (r = rb_first(&mempage_root); r; r = rb_next(r)) {
		struct mempage *p = rb_entry(r, struct mempage, nd);

		if (k++ == 0)
			fprintf(f, "Per page corrected memory statistics:\n");
		msg = bucket_output(&page_trigger_conf, &p->ce.bucket);
		fprintf(f, "%llx: total %u seen \"%s\" %s%s\n",
			p->addr,
			p->ce.count,
			msg,
			page_state[(unsigned)p->offlined],
			p->triggered ? " triggered" : "");
		free(msg);
		fputc('\n', f);
	}
}

void page_setup(void)
{
	int n;

	config_trigger("page", "memory-ce", &page_trigger_conf);
	config_trigger("page", "memory-ce-counter-replacement", &mp_replacement_trigger_conf);
	n = config_choice("page", "memory-ce-action", offline_choice);
	if (n >= 0)
		offline = n;
	if (offline > OFFLINE_ACCOUNT &&
	    !sysfs_available(kernel_offline[offline], W_OK)) {
		Lprintf("Kernel does not support page offline interface\n");
		offline = OFFLINE_ACCOUNT;
	}

	page_error_pre_soft_trigger = config_string("page", "memory-pre-sync-soft-ce-trigger");
	if (page_error_pre_soft_trigger && trigger_check(page_error_pre_soft_trigger) < 0) {
		SYSERRprintf("Cannot access page soft pre trigger `%s'",
				page_error_pre_soft_trigger);
		exit(1);
	}

	page_error_post_soft_trigger = config_string("page", "memory-post-sync-soft-ce-trigger");
	if (page_error_post_soft_trigger && trigger_check(page_error_post_soft_trigger) < 0) {
		SYSERRprintf("Cannot access page soft post trigger `%s'",
				page_error_post_soft_trigger);
		exit(1);
	}

	n = max_corr_err_counters;
	max_corr_err_counters = roundup(max_corr_err_counters, N);
	if (n != max_corr_err_counters)
		Lprintf("Rounded up max-corr-err-counters from %d to %d\n", n, max_corr_err_counters);

	bucket_init(&mp_replacement.bucket);
}