1 /* Copyright (C) 2009 Intel Corporation
2 Author: Andi Kleen
3 Memory error accounting per page
4
5 mcelog is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public
7 License as published by the Free Software Foundation; version
8 2.
9
10 mcelog is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should find a copy of v2 of the GNU General Public License somewhere
16 on your Linux system; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
18
19 /* NB
20 investigate other data structures. Primary consideration would
21 be space efficiency. rbtree nodes are rather large.
22
23 Do we need aging? Right now the only way to get rid of old nodes
24 is to restart. */
25 #define _GNU_SOURCE 1
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <unistd.h>
29 #include <fcntl.h>
30 #include <errno.h>
31 #include <string.h>
32 #include <sys/mman.h>
33 #include <assert.h>
34 #include "memutil.h"
35 #include "trigger.h"
36 #include "mcelog.h"
37 #include "rbtree.h"
38 #include "list.h"
39 #include "leaky-bucket.h"
40 #include "page.h"
41 #include "config.h"
42 #include "memdb.h"
43 #include "sysfs.h"
44
/* Fixed 4K page granularity used for accounting; assumed, not queried from the kernel */
#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)

/* Offline state of a tracked page; values index the page_state[] name table */
enum { PAGE_ONLINE = 0, PAGE_OFFLINE = 1, PAGE_OFFLINE_FAILED = 2 };
49
/*
 * Per-page corrected-error record, kept in an rbtree keyed by the
 * page-aligned physical address. Kept small on purpose: large systems
 * may need to track many pages.
 */
struct mempage {
	struct rb_node nd;	/* rbtree linkage into mempage_root */
	/* one char used by rb_node */
	char offlined;		/* PAGE_ONLINE / PAGE_OFFLINE / PAGE_OFFLINE_FAILED */
	char triggered;		/* set once the page error trigger has fired */
	// 1(32bit)-5(64bit) bytes of padding to play with here
	u64 addr;		/* page-aligned physical address (rbtree key) */
	struct err_type ce;	/* corrected error count + leaky bucket */
};
59
/* Number of mempage slots carved out of one page-sized cluster.
   NOTE(review): this does not reserve room for mp_used; relies on the
   division remainder being large enough to hold it — TODO confirm. */
#define N ((PAGE_SIZE - sizeof(struct list_head)) / sizeof(struct mempage))
/* Map a mempage pointer back to its enclosing, page-aligned cluster */
#define to_cluster(mp) (struct mempage_cluster *)((long)(mp) & ~((long)(PAGE_SIZE - 1)))

/*
 * Page-sized allocation unit for mempage entries. Clusters are mmap'ed
 * one at a time (so they are page aligned, which to_cluster() depends on)
 * and linked on an LRU list so the oldest can be recycled.
 */
struct mempage_cluster {
	struct list_head lru;	/* position on mempage_cluster_lru_list */
	struct mempage mp[N];	/* fixed array of per-page counters */
	int mp_used;		/* number of mp[] slots handed out so far */
};
68
/* Statistics on how often per-page counters get recycled (replaced) */
struct mempage_replacement {
	struct leaky_bucket bucket;	/* rate-limits the replacement trigger */
	unsigned count;			/* total replacements since start */
};

enum {
	MAX_ENV = 20,	/* upper bound on trigger environment entries */
};

static int corr_err_counters;			/* counters currently allocated (vs max_corr_err_counters) */
static struct mempage_cluster *mp_cluster;	/* cluster new entries are carved from */
static struct mempage_replacement mp_repalcement;	/* (sic) replacement statistics */
static struct rb_root mempage_root;		/* all tracked pages, keyed by address */
static LIST_HEAD(mempage_cluster_lru_list);	/* clusters, most recently used first */
static struct bucket_conf page_trigger_conf;		/* "memory-ce" trigger config */
static struct bucket_conf mp_replacement_trigger_conf;	/* "memory-ce-counter-replacement" config */
static char *page_error_pre_soft_trigger, *page_error_post_soft_trigger;

/* Human readable names for mempage.offlined; indexed by the state enum */
static const char *page_state[] = {
	[PAGE_ONLINE] = "online",
	[PAGE_OFFLINE] = "offline",
	[PAGE_OFFLINE_FAILED] = "offline-failed",
};
92
mempage_alloc(void)93 static struct mempage *mempage_alloc(void)
94 {
95 if (!mp_cluster || mp_cluster->mp_used == N) {
96 mp_cluster = mmap(0, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
97 if (mp_cluster == MAP_FAILED)
98 Enomem();
99 }
100
101 return &mp_cluster->mp[mp_cluster->mp_used++];
102 }
103
mempage_replace(void)104 static struct mempage *mempage_replace(void)
105 {
106 /* If no free mp_cluster, reuse the last mp_cluster of the LRU list */
107 if (mp_cluster->mp_used == N) {
108 mp_cluster = list_last_entry(&mempage_cluster_lru_list, struct mempage_cluster, lru);
109 mp_cluster->mp_used = 0;
110 }
111
112 return &mp_cluster->mp[mp_cluster->mp_used++];
113 }
114
/* Find the mempage tracking page address @addr, or NULL when untracked. */
static struct mempage *mempage_lookup(u64 addr)
{
	struct rb_node *node;

	for (node = mempage_root.rb_node; node != NULL; ) {
		struct mempage *mp = rb_entry(node, struct mempage, nd);

		if (addr == mp->addr)
			return mp;
		node = addr < mp->addr ? node->rb_left : node->rb_right;
	}
	return NULL;
}
131
132 static struct mempage *
mempage_insert_lookup(u64 addr,struct rb_node * node)133 mempage_insert_lookup(u64 addr, struct rb_node * node)
134 {
135 struct rb_node **p = &mempage_root.rb_node;
136 struct rb_node *parent = NULL;
137 struct mempage *mp;
138
139 while (*p) {
140 parent = *p;
141 mp = rb_entry(parent, struct mempage, nd);
142
143 if (addr < mp->addr)
144 p = &(*p)->rb_left;
145 else if (addr > mp->addr)
146 p = &(*p)->rb_right;
147 else
148 return mp;
149 }
150 rb_link_node(node, parent, p);
151 rb_insert_color(node, &mempage_root);
152 return NULL;
153 }
154
/*
 * Key @mp by @addr and insert it into the tree. Returns the existing
 * entry for @addr if there already was one (in which case @mp was not
 * inserted), otherwise NULL.
 */
static struct mempage *mempage_insert(u64 addr, struct mempage *mp)
{
	mp->addr = addr;
	return mempage_insert_lookup(addr, &mp->nd);
}
161
/* Re-key a recycled counter: unlink @mp and re-insert it under @addr. */
static void mempage_rb_tree_update(u64 addr, struct mempage *mp)
{
	rb_erase(&mp->nd, &mempage_root);
	mempage_insert(addr, mp);
}
167
mempage_cluster_lru_list_insert(struct mempage_cluster * mp_cluster)168 static void mempage_cluster_lru_list_insert(struct mempage_cluster *mp_cluster)
169 {
170 list_add(&mp_cluster->lru, &mempage_cluster_lru_list);
171 }
172
mempage_cluster_lru_list_update(struct mempage_cluster * mp_cluster)173 static void mempage_cluster_lru_list_update(struct mempage_cluster *mp_cluster)
174 {
175 if (list_is_first(&mp_cluster->lru, &mempage_cluster_lru_list))
176 return;
177
178 list_del(&mp_cluster->lru);
179 list_add(&mp_cluster->lru, &mempage_cluster_lru_list);
180 }
181
182 /* Following arrays need to be all kept in sync with the enum */
183
/* Page offlining policy, set from the "memory-ce-action" config option */
enum otype {
	OFFLINE_OFF,
	OFFLINE_ACCOUNT,
	OFFLINE_SOFT,
	OFFLINE_HARD,
	OFFLINE_SOFT_THEN_HARD
};

/* sysfs files used to request offlining; indexed by enum otype */
static const char *kernel_offline[] = {
	[OFFLINE_SOFT] = "/sys/devices/system/memory/soft_offline_page",
	[OFFLINE_HARD] = "/sys/devices/system/memory/hard_offline_page",
	[OFFLINE_SOFT_THEN_HARD] = "/sys/devices/system/memory/soft_offline_page"
};

/* Valid config values for "memory-ce-action"; must stay in sync with enum otype */
static struct config_choice offline_choice[] = {
	{ "off", OFFLINE_OFF },
	{ "account", OFFLINE_ACCOUNT },
	{ "soft", OFFLINE_SOFT },
	{ "hard", OFFLINE_HARD },
	{ "soft-then-hard", OFFLINE_SOFT_THEN_HARD },
	{}
};

/* Current offlining policy (default: no accounting at all) */
static enum otype offline = OFFLINE_OFF;
208
/*
 * Ask the kernel to offline the page at @addr through the sysfs file
 * selected by @type. Returns the sysfs_write() result (negative on error).
 */
static int do_memory_offline(u64 addr, enum otype type)
{
	return sysfs_write(kernel_offline[type], "%#llx", addr);
}
213
/*
 * Offline the page at @addr according to the configured policy.
 * For soft-then-hard, fall back to hard offlining when the soft
 * attempt fails. Returns >= 0 on success, negative on failure.
 */
static int memory_offline(u64 addr)
{
	if (offline != OFFLINE_SOFT_THEN_HARD)
		return do_memory_offline(addr, offline);

	if (do_memory_offline(addr, OFFLINE_SOFT) >= 0)
		return 0;
	Lprintf("Soft offlining of page %llx failed, trying hard offlining\n",
		addr);
	return do_memory_offline(addr, OFFLINE_HARD);
}
226
/*
 * Try to offline the page at @addr and record the outcome in
 * @mp->offlined. No-op for the "off" and "account" policies.
 */
static void offline_action(struct mempage *mp, u64 addr)
{
	if (offline <= OFFLINE_ACCOUNT)
		return;
	Lprintf("Offlining page %llx\n", addr);
	if (memory_offline(addr) >= 0) {
		mp->offlined = PAGE_OFFLINE;
	} else {
		Lprintf("Offlining page %llx failed: %s\n", addr, strerror(errno));
		mp->offlined = PAGE_OFFLINE_FAILED;
	}
}
238
239 /* Run a user defined trigger when the replacement threshold of page error counter crossed. */
counter_trigger(char * msg,time_t t,struct mempage_replacement * mr,struct bucket_conf * bc,bool sync)240 static void counter_trigger(char *msg, time_t t, struct mempage_replacement *mr,
241 struct bucket_conf *bc, bool sync)
242 {
243 struct leaky_bucket *bk = &mr->bucket;
244 char *env[MAX_ENV], *out, *thresh;
245 int i, ei = 0;
246
247 thresh = bucket_output(bc, bk);
248 xasprintf(&out, "%s: %s", msg, thresh);
249
250 if (bc->log)
251 Gprintf("%s\n", out);
252
253 if (!bc->trigger)
254 goto out;
255
256 xasprintf(&env[ei++], "THRESHOLD=%s", thresh);
257 xasprintf(&env[ei++], "TOTALCOUNT=%lu", mr->count);
258 if (t)
259 xasprintf(&env[ei++], "LASTEVENT=%lu", t);
260 xasprintf(&env[ei++], "AGETIME=%u", bc->agetime);
261 xasprintf(&env[ei++], "MESSAGE=%s", out);
262 xasprintf(&env[ei++], "THRESHOLD_COUNT=%d", bk->count);
263 env[ei] = NULL;
264 assert(ei < MAX_ENV);
265
266 run_trigger(bc->trigger, NULL, env, sync, "page-error-counter");
267
268 for (i = 0; i < ei; i++)
269 free(env[i]);
270 out:
271 free(out);
272 free(thresh);
273 }
274
account_page_error(struct mce * m,int channel,int dimm)275 void account_page_error(struct mce *m, int channel, int dimm)
276 {
277 u64 addr = m->addr;
278 struct mempage *mp;
279 char *msg, *thresh;
280 time_t t;
281 unsigned cpu = m->extcpu ? m->extcpu : m->cpu;
282
283 if (offline == OFFLINE_OFF)
284 return;
285 if (!(m->status & MCI_STATUS_ADDRV) || (m->status & MCI_STATUS_UC))
286 return;
287
288 switch (cputype) {
289 case CPU_SANDY_BRIDGE_EP:
290 /*
291 * On SNB-EP platform we see corrected errors reported with
292 * address in Bank 5 from hardware (depending on BIOS setting),
293 * in the meanwhile, a duplicate record constructed from
294 * information found by "firmware first" APEI code. Ignore the
295 * duplicate information so that we don't double count errors.
296 *
297 * NOTE: the record from APEI fake this error from CPU 0 BANK 1.
298 */
299 if (m->bank == 1 && cpu == 0)
300 return;
301 default:
302 break;
303 }
304
305 t = m->time;
306 addr &= ~((u64)PAGE_SIZE - 1);
307 mp = mempage_lookup(addr);
308 if (!mp && corr_err_counters < max_corr_err_counters) {
309 mp = mempage_alloc();
310 bucket_init(&mp->ce.bucket);
311 mempage_insert(addr, mp);
312 mempage_cluster_lru_list_insert(to_cluster(mp));
313 corr_err_counters++;
314 } else if (!mp) {
315 mp = mempage_replace();
316 bucket_init(&mp->ce.bucket);
317 mempage_rb_tree_update(addr, mp);
318 mempage_cluster_lru_list_update(to_cluster(mp));
319
320 /* Report how often the replacement of counter 'mp' happened */
321 ++mp_repalcement.count;
322 if (__bucket_account(&mp_replacement_trigger_conf, &mp_repalcement.bucket, 1, t)) {
323 thresh = bucket_output(&mp_replacement_trigger_conf, &mp_repalcement.bucket);
324 xasprintf(&msg, "Replacements of page correctable error counter exceed threshold %s", thresh);
325 free(thresh);
326
327 counter_trigger(msg, t, &mp_repalcement, &mp_replacement_trigger_conf, false);
328 free(msg);
329 }
330 } else {
331 mempage_cluster_lru_list_update(to_cluster(mp));
332 }
333 ++mp->ce.count;
334 if (__bucket_account(&page_trigger_conf, &mp->ce.bucket, 1, t)) {
335 struct memdimm *md;
336
337 if (mp->offlined != PAGE_ONLINE)
338 return;
339 /* Only do triggers and messages for online pages */
340 thresh = bucket_output(&page_trigger_conf, &mp->ce.bucket);
341 md = get_memdimm(m->socketid, channel, dimm, 1);
342 xasprintf(&msg, "Corrected memory errors on page %llx exceed threshold %s",
343 addr, thresh);
344 free(thresh);
345 memdb_trigger(msg, md, t, &mp->ce, &page_trigger_conf, NULL, false, "page");
346 free(msg);
347 mp->triggered = 1;
348
349 if (offline == OFFLINE_SOFT || offline == OFFLINE_SOFT_THEN_HARD) {
350 struct bucket_conf page_soft_trigger_conf;
351 char *argv[] = {
352 NULL,
353 NULL,
354 NULL,
355 };
356 char *args;
357
358 asprintf(&args, "%lld", addr);
359 argv[0]=args;
360
361 memcpy(&page_soft_trigger_conf, &page_trigger_conf, sizeof(struct bucket_conf));
362 page_soft_trigger_conf.trigger = page_error_pre_soft_trigger;
363 argv[0]=page_error_pre_soft_trigger;
364 argv[1]=args;
365 asprintf(&msg, "pre soft trigger run for page %lld", addr);
366 memdb_trigger(msg, md, t, &mp->ce, &page_soft_trigger_conf, argv, true, "page_pre_soft");
367 free(msg);
368
369 offline_action(mp, addr);
370
371 memcpy(&page_soft_trigger_conf, &page_trigger_conf, sizeof(struct bucket_conf));
372 page_soft_trigger_conf.trigger = page_error_post_soft_trigger;
373 argv[0]=page_error_post_soft_trigger;
374 argv[1]=args;
375 asprintf(&msg, "post soft trigger run for page %lld", addr);
376 memdb_trigger(msg, md, t, &mp->ce, &page_soft_trigger_conf, argv, true, "page_post_soft");
377 free(msg);
378 free(args);
379
380 } else
381 offline_action(mp, addr);
382 }
383 }
384
dump_page_errors(FILE * f)385 void dump_page_errors(FILE *f)
386 {
387 char *msg;
388 struct rb_node *r;
389 long k;
390
391 k = 0;
392 for (r = rb_first(&mempage_root); r; r = rb_next(r)) {
393 struct mempage *p = rb_entry(r, struct mempage, nd);
394
395 if (k++ == 0)
396 fprintf(f, "Per page corrected memory statistics:\n");
397 msg = bucket_output(&page_trigger_conf, &p->ce.bucket);
398 fprintf(f, "%llx: total %u seen \"%s\" %s%s\n",
399 p->addr,
400 p->ce.count,
401 msg,
402 page_state[(unsigned)p->offlined],
403 p->triggered ? " triggered" : "");
404 free(msg);
405 fputc('\n', f);
406 }
407 }
408
page_setup(void)409 void page_setup(void)
410 {
411 int n;
412
413 config_trigger("page", "memory-ce", &page_trigger_conf);
414 config_trigger("page", "memory-ce-counter-replacement", &mp_replacement_trigger_conf);
415 n = config_choice("page", "memory-ce-action", offline_choice);
416 if (n >= 0)
417 offline = n;
418 if (offline > OFFLINE_ACCOUNT &&
419 !sysfs_available(kernel_offline[offline], W_OK)) {
420 Lprintf("Kernel does not support page offline interface\n");
421 offline = OFFLINE_ACCOUNT;
422 }
423
424 page_error_pre_soft_trigger = config_string("page", "memory-pre-sync-soft-ce-trigger");
425
426 if (page_error_pre_soft_trigger && trigger_check(page_error_pre_soft_trigger) < 0) {
427 SYSERRprintf("Cannot access page soft pre trigger `%s'",
428 page_error_pre_soft_trigger);
429 exit(1);
430 }
431
432 page_error_post_soft_trigger= config_string("page", "memory-post-sync-soft-ce-trigger");
433 if (page_error_post_soft_trigger && trigger_check(page_error_post_soft_trigger) < 0) {
434 SYSERRprintf("Cannot access page soft post trigger `%s'",
435 page_error_post_soft_trigger);
436 exit(1);
437 }
438
439 n = max_corr_err_counters;
440 max_corr_err_counters = roundup(max_corr_err_counters, N);
441 if (n != max_corr_err_counters)
442 Lprintf("Round up max-corr-err-counters from %d to %d\n", n, max_corr_err_counters);
443
444 bucket_init(&mp_repalcement.bucket);
445 }
446