1 /* $NetBSD: kern_heartbeat.c,v 1.5 2023/07/16 10:18:19 riastradh Exp $ */
2
3 /*-
4 * Copyright (c) 2023 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 * heartbeat(9) -- periodic checks to ensure CPUs are making progress
31 *
32 * Manual tests to run when changing this file. Magic numbers are for
33 * evbarm; adjust for other platforms. Tests involving cpuctl
34 * online/offline assume a 2-CPU system -- for full testing on a >2-CPU
35 * system, offline all but one CPU.
36 *
37 * 1. cpuctl offline 0
38 * sleep 20
39 * cpuctl online 0
40 *
41 * 2. cpuctl offline 1
42 * sleep 20
43 * cpuctl online 1
44 *
45 * 3. cpuctl offline 0
46 * sysctl -w kern.heartbeat.max_period=5
47 * sleep 10
48 * sysctl -w kern.heartbeat.max_period=0
49 * sleep 10
50 * sysctl -w kern.heartbeat.max_period=5
51 * sleep 10
52 * cpuctl online 0
53 *
54 * 4. sysctl -w debug.crashme_enable=1
55 * sysctl -w debug.crashme.spl_spinout=1 # IPL_SOFTCLOCK
56 * # verify system panics after 15sec
57 *
58 * 5. sysctl -w debug.crashme_enable=1
59 * sysctl -w debug.crashme.spl_spinout=6 # IPL_SCHED
60 * # verify system panics after 15sec
61 *
62 * 6. cpuctl offline 0
63 * sysctl -w debug.crashme_enable=1
64 * sysctl -w debug.crashme.spl_spinout=1 # IPL_SOFTCLOCK
65 * # verify system panics after 15sec
66 *
67 * 7. cpuctl offline 0
68 * sysctl -w debug.crashme_enable=1
69 * sysctl -w debug.crashme.spl_spinout=5 # IPL_VM
70 * # verify system panics after 15sec
71 *
72 * # Not this -- IPL_SCHED and IPL_HIGH spinout on a single CPU
73 * # require a hardware watchdog timer.
74 * #cpuctl offline 0
 *	#sysctl -w debug.crashme_enable=1
76 * #sysctl -w debug.crashme.spl_spinout=6 # IPL_SCHED
77 * # hope watchdog timer kicks in
78 */
79
80 #include <sys/cdefs.h>
81 __KERNEL_RCSID(0, "$NetBSD: kern_heartbeat.c,v 1.5 2023/07/16 10:18:19 riastradh Exp $");
82
83 #ifdef _KERNEL_OPT
84 #include "opt_ddb.h"
85 #include "opt_heartbeat.h"
86 #endif
87
88 #include "heartbeat.h"
89
90 #include <sys/param.h>
91 #include <sys/types.h>
92
93 #include <sys/atomic.h>
94 #include <sys/cpu.h>
95 #include <sys/errno.h>
96 #include <sys/heartbeat.h>
97 #include <sys/ipi.h>
98 #include <sys/kernel.h>
99 #include <sys/mutex.h>
100 #include <sys/sysctl.h>
101 #include <sys/systm.h>
102 #include <sys/xcall.h>
103
104 #ifdef DDB
105 #include <ddb/ddb.h>
106 #endif
107
108 /*
109 * Global state.
110 *
111 * heartbeat_lock serializes access to heartbeat_max_period_secs
112 * and heartbeat_max_period_ticks. Two separate variables so we
113 * can avoid multiplication or division in the heartbeat routine.
114 *
115 * heartbeat_sih is stable after initialization in
116 * heartbeat_start.
117 */
118 kmutex_t heartbeat_lock __cacheline_aligned;
119 unsigned heartbeat_max_period_secs __read_mostly;
120 unsigned heartbeat_max_period_ticks __read_mostly;
121
122 void *heartbeat_sih __read_mostly;
123
/*
 * heartbeat_suspend()
 *
 *	Stop heartbeat monitoring of the current CPU.
 *
 *	To be called once the current CPU has been marked offline but
 *	while it is still running.  Caller must have preemption
 *	disabled.
 */
void
heartbeat_suspend(void)
{

	/*
	 * There is no per-CPU state to tear down: the heartbeat code
	 * simply tests SPCF_OFFLINE to decide whether a CPU is being
	 * monitored.  All we do here is check the calling context.
	 */
	KASSERT(curcpu_stable());
}
142
143 /*
144 * heartbeat_resume_cpu(ci)
145 *
146 * Resume heartbeat monitoring of ci.
147 *
148 * Called at startup while cold, and whenever heartbeat monitoring
149 * is re-enabled after being disabled or the period is changed.
150 * When not cold, ci must be the current CPU.
151 */
152 static void
heartbeat_resume_cpu(struct cpu_info * ci)153 heartbeat_resume_cpu(struct cpu_info *ci)
154 {
155
156 KASSERT(__predict_false(cold) || curcpu_stable());
157 KASSERT(__predict_false(cold) || ci == curcpu());
158
159 ci->ci_heartbeat_count = 0;
160 ci->ci_heartbeat_uptime_cache = time_uptime;
161 ci->ci_heartbeat_uptime_stamp = 0;
162 }
163
/*
 * heartbeat_resume()
 *
 *	Restart heartbeat monitoring of the current CPU.
 *
 *	To be called after the current CPU has started running but
 *	before it is marked online.  Also serves as the body of the
 *	cross-call that set_max_period issues when the maximum period
 *	goes from zero to nonzero.  Caller must have preemption
 *	disabled.
 */
void
heartbeat_resume(void)
{
	int ipl;

	KASSERT(curcpu_stable());

	/*
	 * Keep heartbeats out at splsched while the count and the
	 * uptime stamp are being reset; otherwise a heartbeat landing
	 * halfway through could see an inconsistent pair and report a
	 * bogus heart attack.
	 */
	ipl = splsched();
	heartbeat_resume_cpu(curcpu());
	splx(ipl);
}
191
/*
 * heartbeat_reset_xc(a, b)
 *
 *	Cross-call handler: reset the heartbeat state of the CPU it
 *	runs on, just before heartbeat checks are enabled.  Both
 *	arguments are ignored.
 */
static void
heartbeat_reset_xc(void *a, void *b)
{

	(void)a;
	(void)b;

	heartbeat_resume();
}
204
205 /*
206 * set_max_period(max_period)
207 *
208 * Set the maximum period, in seconds, for heartbeat checks.
209 *
210 * - If max_period is zero, disable them.
211 *
212 * - If the max period was zero and max_period is nonzero, ensure
213 * all CPUs' heartbeat uptime caches are up-to-date before
214 * re-enabling them.
215 *
216 * max_period must be below UINT_MAX/4/hz to avoid arithmetic
217 * overflow and give room for slop.
218 *
219 * Caller must hold heartbeat_lock.
220 */
221 static void
set_max_period(unsigned max_period)222 set_max_period(unsigned max_period)
223 {
224
225 KASSERTMSG(max_period <= UINT_MAX/4/hz,
226 "max_period=%u must not exceed UINT_MAX/4/hz=%u (hz=%u)",
227 max_period, UINT_MAX/4/hz, hz);
228 KASSERT(mutex_owned(&heartbeat_lock));
229
230 /*
231 * If we're enabling heartbeat checks, make sure we have a
232 * reasonably up-to-date time_uptime cache on all CPUs so we
233 * don't think we had an instant heart attack.
234 */
235 if (heartbeat_max_period_secs == 0 && max_period != 0) {
236 if (cold) {
237 CPU_INFO_ITERATOR cii;
238 struct cpu_info *ci;
239
240 for (CPU_INFO_FOREACH(cii, ci))
241 heartbeat_resume_cpu(ci);
242 } else {
243 const uint64_t ticket =
244 xc_broadcast(0, &heartbeat_reset_xc, NULL, NULL);
245 xc_wait(ticket);
246 }
247 }
248
249 /*
250 * Once the heartbeat state has been updated on all (online)
251 * CPUs, set the period. At this point, heartbeat checks can
252 * begin.
253 */
254 atomic_store_relaxed(&heartbeat_max_period_secs, max_period);
255 atomic_store_relaxed(&heartbeat_max_period_ticks, max_period*hz);
256 }
257
258 /*
259 * heartbeat_max_period_ticks(SYSCTLFN_ARGS)
260 *
261 * Sysctl handler for sysctl kern.heartbeat.max_period. Verifies
262 * it lies within a reasonable interval and sets it.
263 */
264 static int
heartbeat_max_period_sysctl(SYSCTLFN_ARGS)265 heartbeat_max_period_sysctl(SYSCTLFN_ARGS)
266 {
267 struct sysctlnode node;
268 unsigned max_period;
269 int error;
270
271 mutex_enter(&heartbeat_lock);
272
273 max_period = heartbeat_max_period_secs;
274 node = *rnode;
275 node.sysctl_data = &max_period;
276 error = sysctl_lookup(SYSCTLFN_CALL(&node));
277 if (error || newp == NULL)
278 goto out;
279
280 /*
281 * Ensure there's plenty of slop between heartbeats.
282 */
283 if (max_period > UINT_MAX/4/hz) {
284 error = EOVERFLOW;
285 goto out;
286 }
287
288 /*
289 * Success! Set the period. This enables heartbeat checks if
290 * we went from zero period to nonzero period, or disables them
291 * if the other way around.
292 */
293 set_max_period(max_period);
294 error = 0;
295
296 out: mutex_exit(&heartbeat_lock);
297 return error;
298 }
299
300 /*
301 * sysctl_heartbeat_setup()
302 *
303 * Set up the kern.heartbeat.* sysctl subtree.
304 */
305 SYSCTL_SETUP(sysctl_heartbeat_setup, "sysctl kern.heartbeat setup")
306 {
307 const struct sysctlnode *rnode;
308 int error;
309
310 mutex_init(&heartbeat_lock, MUTEX_DEFAULT, IPL_NONE);
311
312 /* kern.heartbeat */
313 error = sysctl_createv(NULL, 0, NULL, &rnode,
314 CTLFLAG_PERMANENT,
315 CTLTYPE_NODE, "heartbeat",
316 SYSCTL_DESCR("Kernel heartbeat parameters"),
317 NULL, 0, NULL, 0,
318 CTL_KERN, CTL_CREATE, CTL_EOL);
319 if (error) {
320 printf("%s: failed to create kern.heartbeat: %d\n",
321 __func__, error);
322 return;
323 }
324
325 /* kern.heartbeat.max_period */
326 error = sysctl_createv(NULL, 0, &rnode, NULL,
327 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
328 CTLTYPE_INT, "max_period",
329 SYSCTL_DESCR("Max seconds between heartbeats before panic"),
330 &heartbeat_max_period_sysctl, 0, NULL, 0,
331 CTL_CREATE, CTL_EOL);
332 if (error) {
333 printf("%s: failed to create kern.heartbeat.max_period: %d\n",
334 __func__, error);
335 return;
336 }
337 }
338
339 /*
340 * heartbeat_intr(cookie)
341 *
342 * Soft interrupt handler to update the local CPU's view of the
343 * system uptime. This runs at the same priority level as
344 * callouts, so if callouts are stuck on this CPU, it won't run,
345 * and eventually another CPU will notice that this one is stuck.
346 *
347 * Don't do spl* here -- keep it to a minimum so if anything goes
348 * wrong we don't end up with hard interrupts blocked and unable
349 * to detect a missed heartbeat.
350 */
351 static void
heartbeat_intr(void * cookie)352 heartbeat_intr(void *cookie)
353 {
354 unsigned count = atomic_load_relaxed(&curcpu()->ci_heartbeat_count);
355 unsigned uptime = time_uptime;
356
357 atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_stamp, count);
358 atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_cache, uptime);
359 }
360
361 /*
362 * heartbeat_start()
363 *
364 * Start system heartbeat monitoring.
365 */
366 void
heartbeat_start(void)367 heartbeat_start(void)
368 {
369 const unsigned max_period = HEARTBEAT_MAX_PERIOD_DEFAULT;
370
371 /*
372 * Establish a softint so we can schedule it once ready. This
373 * should be at the lowest softint priority level so that we
374 * ensure all softint priorities are making progress.
375 */
376 heartbeat_sih = softint_establish(SOFTINT_CLOCK|SOFTINT_MPSAFE,
377 &heartbeat_intr, NULL);
378
379 /*
380 * Now that the softint is established, kick off heartbeat
381 * monitoring with the default period. This will initialize
382 * the per-CPU state to an up-to-date cache of time_uptime.
383 */
384 mutex_enter(&heartbeat_lock);
385 set_max_period(max_period);
386 mutex_exit(&heartbeat_lock);
387 }
388
389 /*
390 * defibrillator(cookie)
391 *
392 * IPI handler for defibrillation. If the CPU's heart has stopped
393 * beating normally, but the CPU can still execute things,
394 * acknowledge the IPI to the doctor and then panic so we at least
395 * get a stack trace from whatever the current CPU is stuck doing,
396 * if not a core dump.
397 *
398 * (This metaphor is a little stretched, since defibrillation is
399 * usually administered when the heart is beating errattically but
400 * hasn't stopped, and causes the heart to stop temporarily, and
401 * one hopes it is not fatal. But we're (software) engineers, so
402 * we can stretch metaphors like silly putty in a blender.)
403 */
404 static void
defibrillator(void * cookie)405 defibrillator(void *cookie)
406 {
407 bool *ack = cookie;
408
409 atomic_store_relaxed(ack, true);
410 panic("%s[%d %s]: heart stopped beating", cpu_name(curcpu()),
411 curlwp->l_lid,
412 curlwp->l_name ? curlwp->l_name : curproc->p_comm);
413 }
414
415 /*
416 * defibrillate(ci, unsigned d)
417 *
418 * The patient CPU ci's heart has stopped beating after d seconds.
419 * Force the patient CPU ci to panic, or panic on this CPU if the
420 * patient CPU doesn't respond within 1sec.
421 */
422 static void __noinline
defibrillate(struct cpu_info * ci,unsigned d)423 defibrillate(struct cpu_info *ci, unsigned d)
424 {
425 bool ack = false;
426 ipi_msg_t msg = {
427 .func = &defibrillator,
428 .arg = &ack,
429 };
430 unsigned countdown = 1000; /* 1sec */
431
432 KASSERT(curcpu_stable());
433
434 /*
435 * First notify the console that the patient CPU's heart seems
436 * to have stopped beating.
437 */
438 printf("%s: found %s heart stopped beating after %u seconds\n",
439 cpu_name(curcpu()), cpu_name(ci), d);
440
441 /*
442 * Next, give the patient CPU a chance to panic, so we get a
443 * stack trace on that CPU even if we don't get a crash dump.
444 */
445 ipi_unicast(&msg, ci);
446
447 /*
448 * Busy-wait up to 1sec for the patient CPU to print a stack
449 * trace and panic. If the patient CPU acknowledges the IPI,
450 * or if we're panicking anyway, just give up and stop here --
451 * the system is coming down soon and we should avoid getting
452 * in the way.
453 */
454 while (countdown --> 0) {
455 if (atomic_load_relaxed(&ack) ||
456 atomic_load_relaxed(&panicstr) != NULL)
457 return;
458 DELAY(1000); /* 1ms */
459 }
460
461 /*
462 * The patient CPU failed to acknowledge the panic request.
463 * Panic now; with any luck, we'll get a crash dump.
464 */
465 panic("%s: found %s heart stopped beating and unresponsive",
466 cpu_name(curcpu()), cpu_name(ci));
467 }
468
469 /*
470 * select_patient()
471 *
472 * Select another CPU to check the heartbeat of. Returns NULL if
473 * there are no other online CPUs. Never returns curcpu().
474 * Caller must have kpreemption disabled.
475 */
476 static struct cpu_info *
select_patient(void)477 select_patient(void)
478 {
479 CPU_INFO_ITERATOR cii;
480 struct cpu_info *first = NULL, *patient = NULL, *ci;
481 bool passedcur = false;
482
483 KASSERT(curcpu_stable());
484
485 /*
486 * In the iteration order of all CPUs, find the next online CPU
487 * after curcpu(), or the first online one if curcpu() is last
488 * in the iteration order.
489 */
490 for (CPU_INFO_FOREACH(cii, ci)) {
491 if (ci->ci_schedstate.spc_flags & SPCF_OFFLINE)
492 continue;
493 if (passedcur) {
494 /*
495 * (...|curcpu()|ci|...)
496 *
497 * Found the patient right after curcpu().
498 */
499 KASSERT(patient != ci);
500 patient = ci;
501 break;
502 }
503 if (ci == curcpu()) {
504 /*
505 * (...|prev|ci=curcpu()|next|...)
506 *
507 * Note that we want next (or first, if there's
508 * nothing after curcpu()).
509 */
510 passedcur = true;
511 continue;
512 }
513 if (first == NULL) {
514 /*
515 * (ci|...|curcpu()|...)
516 *
517 * Record ci as first in case there's nothing
518 * after curcpu().
519 */
520 first = ci;
521 continue;
522 }
523 }
524
525 /*
526 * If we hit the end, wrap around to the beginning.
527 */
528 if (patient == NULL) {
529 KASSERT(passedcur);
530 patient = first;
531 }
532
533 return patient;
534 }
535
536 /*
537 * heartbeat()
538 *
539 * 1. Count a heartbeat on the local CPU.
540 *
541 * 2. Panic if the system uptime doesn't seem to have advanced in
542 * a while.
543 *
544 * 3. Panic if the soft interrupt on this CPU hasn't advanced the
545 * local view of the system uptime.
546 *
547 * 4. Schedule the soft interrupt to advance the local view of the
548 * system uptime.
549 *
550 * 5. Select another CPU to check the heartbeat of.
551 *
552 * 6. Panic if the other CPU hasn't advanced its view of the
553 * system uptime in a while.
554 */
555 void
heartbeat(void)556 heartbeat(void)
557 {
558 unsigned period_ticks, period_secs;
559 unsigned count, uptime, cache, stamp, d;
560 struct cpu_info *patient;
561
562 KASSERT(curcpu_stable());
563
564 period_ticks = atomic_load_relaxed(&heartbeat_max_period_ticks);
565 period_secs = atomic_load_relaxed(&heartbeat_max_period_secs);
566 if (__predict_false(period_ticks == 0) ||
567 __predict_false(period_secs == 0) ||
568 __predict_false(curcpu()->ci_schedstate.spc_flags & SPCF_OFFLINE))
569 return;
570
571 /*
572 * Count a heartbeat on this CPU.
573 */
574 count = curcpu()->ci_heartbeat_count++;
575
576 /*
577 * If the uptime hasn't changed, make sure that we haven't
578 * counted too many of our own heartbeats since the uptime last
579 * changed, and stop here -- we only do the cross-CPU work once
580 * per second.
581 */
582 uptime = time_uptime;
583 cache = atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_cache);
584 if (__predict_true(cache == uptime)) {
585 /*
586 * Timecounter hasn't advanced by more than a second.
587 * Make sure the timecounter isn't stuck according to
588 * our heartbeats.
589 *
590 * Our own heartbeat count can't roll back, and
591 * time_uptime should be updated before it wraps
592 * around, so d should never go negative; hence no
593 * check for d < UINT_MAX/2.
594 */
595 stamp =
596 atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_stamp);
597 d = count - stamp;
598 if (__predict_false(d > period_ticks)) {
599 panic("%s: time has not advanced in %u heartbeats",
600 cpu_name(curcpu()), d);
601 }
602 return;
603 }
604
605 /*
606 * If the uptime has changed, make sure that it hasn't changed
607 * so much that softints must be stuck on this CPU. Since
608 * time_uptime is monotonic, this can't go negative, hence no
609 * check for d < UINT_MAX/2.
610 *
611 * This uses the hard timer interrupt handler on the current
612 * CPU to ensure soft interrupts at all priority levels have
613 * made progress.
614 */
615 d = uptime - cache;
616 if (__predict_false(d > period_secs)) {
617 panic("%s: softints stuck for %u seconds",
618 cpu_name(curcpu()), d);
619 }
620
621 /*
622 * Schedule a softint to update our cache of the system uptime
623 * so the next call to heartbeat, on this or another CPU, can
624 * detect progress on this one.
625 */
626 softint_schedule(heartbeat_sih);
627
628 /*
629 * Select a patient to check the heartbeat of. If there's no
630 * other online CPU, nothing to do.
631 */
632 patient = select_patient();
633 if (patient == NULL)
634 return;
635
636 /*
637 * Verify that time is advancing on the patient CPU. If the
638 * delta exceeds UINT_MAX/2, that means it is already ahead by
639 * a little on the other CPU, and the subtraction went
640 * negative, which is OK. If the CPU has been
641 * offlined since we selected it, no worries.
642 *
643 * This uses the current CPU to ensure the other CPU has made
644 * progress, even if the other CPU's hard timer interrupt
645 * handler is stuck for some reason.
646 *
647 * XXX Maybe confirm it hasn't gone negative by more than
648 * max_period?
649 */
650 d = uptime - atomic_load_relaxed(&patient->ci_heartbeat_uptime_cache);
651 if (__predict_false(d > period_secs) &&
652 __predict_false(d < UINT_MAX/2) &&
653 ((patient->ci_schedstate.spc_flags & SPCF_OFFLINE) == 0))
654 defibrillate(patient, d);
655 }
656
#ifdef DDB
/*
 * db_read_unsigned(p)
 *
 *	Fetch the unsigned integer at p through db_read_bytes rather
 *	than by dereferencing p directly, and return it.
 */
static unsigned
db_read_unsigned(const unsigned *p)
{
	unsigned val;

	db_read_bytes((db_addr_t)p, sizeof(val), (char *)&val);
	return val;
}

/*
 * heartbeat_dump()
 *
 *	Print the heartbeat data of all CPUs -- index, heartbeat
 *	count, cached uptime, and uptime stamp -- one line per CPU.
 *	Can be called from ddb.
 */
void
heartbeat_dump(void)
{
	struct cpu_info *ci;

	db_printf("Heartbeats:\n");
	for (ci = db_cpu_first(); ci != NULL; ci = db_cpu_next(ci)) {
		const unsigned idx = db_read_unsigned(&ci->ci_index);
		const unsigned count =
		    db_read_unsigned(&ci->ci_heartbeat_count);
		const unsigned uptime =
		    db_read_unsigned(&ci->ci_heartbeat_uptime_cache);
		const unsigned stamp =
		    db_read_unsigned(&ci->ci_heartbeat_uptime_stamp);

		db_printf("cpu%u: count %u uptime %u stamp %u\n",
		    idx, count, uptime, stamp);
	}
}
#endif
688