1 // Copyright 2020 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
#include "base/threading/hang_watcher.h"

#include <array>
#include <atomic>
#include <limits>
#include <utility>

#include "base/bind.h"
#include "base/callback_helpers.h"
#include "base/containers/flat_map.h"
#include "base/debug/alias.h"
#include "base/debug/crash_logging.h"
#include "base/debug/dump_without_crashing.h"
#include "base/feature_list.h"
#include "base/metrics/field_trial_params.h"
#include "base/metrics/histogram_macros.h"
#include "base/no_destructor.h"
#include "base/ranges/algorithm.h"
#include "base/strings/string_number_conversions.h"
#include "base/synchronization/lock.h"
#include "base/synchronization/waitable_event.h"
#include "base/threading/platform_thread.h"
#include "base/threading/thread_checker.h"
#include "base/threading/thread_restrictions.h"
#include "base/time/default_tick_clock.h"
#include "base/time/time.h"
#include "build/build_config.h"
30
31 namespace base {
32
// static
// Master switch for hang watching. Per-thread-type coverage is further gated
// by the individual FeatureParams below.
constexpr base::Feature kEnableHangWatcher{"EnableHangWatcher",
                                           base::FEATURE_DISABLED_BY_DEFAULT};
// Per-thread-type opt-ins; only consulted when |kEnableHangWatcher| is on
// (see HangWatcher::InitializeOnMainThread()).
constexpr base::FeatureParam<bool> kHangWatchIOThread{
    &kEnableHangWatcher, "hang_watch_io_thread", false};
constexpr base::FeatureParam<bool> kHangWatchUIThread{
    &kEnableHangWatcher, "hang_watch_ui_thread", false};
constexpr base::FeatureParam<bool> kHangWatchThreadPool{
    &kEnableHangWatcher, "hang_watch_threadpool", false};

// Timeout applied by HangWatchScopeEnabled instances created without an
// explicit timeout.
constexpr base::TimeDelta HangWatchScopeEnabled::kDefaultHangWatchTime =
    base::TimeDelta::FromSeconds(10);
45
namespace {
// The active HangWatcher instance. Set in the HangWatcher constructor and
// cleared in its destructor.
HangWatcher* g_instance = nullptr;

// Cached feature state. Written once in InitializeOnMainThread() and read
// with relaxed ordering from any thread afterwards.
std::atomic<bool> g_use_hang_watcher{false};
std::atomic<bool> g_hang_watch_workers{false};
std::atomic<bool> g_hang_watch_io_thread{false};
std::atomic<bool> g_hang_watch_ui_thread{false};

// Emits the hung thread count histogram. |count| is the number of threads
// of type |thread_type| that were hung or became hung during the last
// monitoring window. This function should be invoked for each thread type
// encountered on each call to Monitor().
void LogHungThreadCountHistogram(HangWatcher::ThreadType thread_type,
                                 int count) {
  // Upper bound for the exact-linear histograms below.
  constexpr int kMaxHungThreadCount = 100;
  switch (thread_type) {
    case HangWatcher::ThreadType::kIOThread:
      UMA_HISTOGRAM_EXACT_LINEAR(
          "HangWatcher.NumberOfHungThreadsDuringWatchWindow.BrowserProcess."
          "IOThread",
          count, kMaxHungThreadCount);
      break;
    case HangWatcher::ThreadType::kUIThread:
      UMA_HISTOGRAM_EXACT_LINEAR(
          "HangWatcher.NumberOfHungThreadsDuringWatchWindow.BrowserProcess."
          "UIThread",
          count, kMaxHungThreadCount);
      break;
    case HangWatcher::ThreadType::kThreadPoolThread:
      UMA_HISTOGRAM_EXACT_LINEAR(
          "HangWatcher.NumberOfHungThreadsDuringWatchWindow.BrowserProcess."
          "ThreadPool",
          count, kMaxHungThreadCount);
      break;
  }
}
}  // namespace
82
// Name of the dedicated hang watcher thread.
constexpr const char* kThreadName = "HangWatcher";

// The time that the HangWatcher thread will sleep for between calls to
// Monitor(). Increasing or decreasing this does not modify the type of hangs
// that can be detected. It instead increases the probability that a call to
// Monitor() will happen at the right time to catch a hang. This has to be
// balanced with power/cpu use concerns as busy looping would catch almost all
// hangs but present unacceptable overhead.
const base::TimeDelta kMonitoringPeriod = base::TimeDelta::FromSeconds(10);
92
HangWatchScopeEnabled(TimeDelta timeout)93 HangWatchScopeEnabled::HangWatchScopeEnabled(TimeDelta timeout) {
94 internal::HangWatchState* current_hang_watch_state =
95 internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();
96
97 DCHECK(timeout >= base::TimeDelta()) << "Negative timeouts are invalid.";
98
99 // TODO(crbug.com/1034046): Remove when all threads using
100 // HangWatchScopeEnabled are monitored. Thread is not monitored, noop.
101 if (!current_hang_watch_state)
102 return;
103
104 DCHECK(current_hang_watch_state)
105 << "HangWatchScopeEnabled can only be used on a thread that "
106 "registered for hang watching with HangWatcher::RegisterThread.";
107
108 #if DCHECK_IS_ON()
109 previous_hang_watch_scope_enable_ =
110 current_hang_watch_state->GetCurrentHangWatchScopeEnabled();
111 current_hang_watch_state->SetCurrentHangWatchScopeEnabled(this);
112 #endif
113
114 uint64_t old_flags;
115 base::TimeTicks old_deadline;
116 std::tie(old_flags, old_deadline) =
117 current_hang_watch_state->GetFlagsAndDeadline();
118
119 const bool hangs_ignored_for_current_scope =
120 internal::HangWatchDeadline::IsFlagSet(
121 internal::HangWatchDeadline::Flag::
122 kIgnoreCurrentHangWatchScopeEnabled,
123 old_flags);
124
125 const bool has_active_hang_watch_disabled =
126 internal::HangWatchDeadline::IsFlagSet(
127 internal::HangWatchDeadline::Flag::kHasActiveHangWatchScopeDisabled,
128 old_flags);
129
130 // If the current HangWatchScopeEnabled is ignored but there are no active
131 // HangWatchScopeDisabled instances, temporarely reactivate hang watching for
132 // this newly created HangWatchScopeEnabled. On exiting hang watching is
133 // suspended again to return to the original state.
134 if (hangs_ignored_for_current_scope && !has_active_hang_watch_disabled) {
135 current_hang_watch_state->UnsetIgnoreCurrentHangWatchScopeEnabled();
136 set_hangs_ignored_on_exit_ = true;
137 }
138
139 // TODO(crbug.com/1034046): Check whether we are over deadline already for the
140 // previous HangWatchScopeEnabled here by issuing only one TimeTicks::Now()
141 // and resuing the value.
142
143 previous_deadline_ = old_deadline;
144 TimeTicks deadline = TimeTicks::Now() + timeout;
145 current_hang_watch_state->SetDeadline(deadline);
146 current_hang_watch_state->IncrementNestingLevel();
147 }
148
// Tears down the scope: restores the previous deadline and scope bookkeeping,
// and blocks while a hang capture that implicates this thread is in progress.
HangWatchScopeEnabled::~HangWatchScopeEnabled() {
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);

  internal::HangWatchState* current_hang_watch_state =
      internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();

  // TODO(crbug.com/1034046): Remove when all threads using
  // HangWatchScopeEnabled are monitored. Thread is not monitored, noop.
  if (!current_hang_watch_state) {
    return;
  }

  // If a hang is currently being captured we should block here so execution
  // stops and we avoid recording unrelated stack frames in the crash.
  if (current_hang_watch_state->IsFlagSet(
          internal::HangWatchDeadline::Flag::kShouldBlockOnHang))
    base::HangWatcher::GetInstance()->BlockIfCaptureInProgress();

#if DCHECK_IS_ON()
  // Verify that no Scope was destructed out of order.
  DCHECK_EQ(this, current_hang_watch_state->GetCurrentHangWatchScopeEnabled());
  current_hang_watch_state->SetCurrentHangWatchScopeEnabled(
      previous_hang_watch_scope_enable_);
#endif

  // If a HangWatchScopeDisabled suspended hang watching during the
  // lifetime of this or any nested HangWatchScopeEnabled it can now safely be
  // reactivated by clearing the ignore bit since this is the outer-most scope.
  // See HangWatchScopeDisabled class comments where this represents the
  // destruction of |scope_1|.
  if (current_hang_watch_state->nesting_level() == 1)
    current_hang_watch_state->UnsetIgnoreCurrentHangWatchScopeEnabled();
  // Return to ignoring hangs since this was the previous state before hang
  // watching was temporarily enabled for this HangWatchScopeEnabled only in
  // the constructor. See HangWatchScopeDisabled class comments where the next
  // line of code is part of the destruction of |scope_4|.
  else if (set_hangs_ignored_on_exit_)
    current_hang_watch_state->SetIgnoreCurrentHangWatchScopeEnabled();

  // Reset the deadline to the value it had before entering this
  // HangWatchScopeEnabled.
  current_hang_watch_state->SetDeadline(previous_deadline_);
  // TODO(crbug.com/1034046): Log when a HangWatchScopeEnabled exits after its
  // deadline and that went undetected by the HangWatcher.

  current_hang_watch_state->DecrementNestingLevel();
}
196
HangWatchScopeDisabled()197 HangWatchScopeDisabled::HangWatchScopeDisabled() {
198 internal::HangWatchState* current_hang_watch_state =
199 internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();
200
201 if (!current_hang_watch_state) {
202 return;
203 }
204
205 current_hang_watch_state->SetIgnoreCurrentHangWatchScopeEnabled();
206 current_hang_watch_state->SetHasActiveHangWatchScopeDisabled();
207 }
208
~HangWatchScopeDisabled()209 HangWatchScopeDisabled::~HangWatchScopeDisabled() {
210 internal::HangWatchState* current_hang_watch_state =
211 internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();
212
213 if (!current_hang_watch_state)
214 return;
215
216 // If this instance outlived all HangWatchScopeEnabled instances watching
217 // needs to be reactivated.
218 if (current_hang_watch_state->nesting_level() == 0)
219 current_hang_watch_state->UnsetIgnoreCurrentHangWatchScopeEnabled();
220
221 current_hang_watch_state->UnsetHasActiveHangWatchScopeDisabled();
222 }
223
224 // static
InitializeOnMainThread()225 void HangWatcher::InitializeOnMainThread() {
226 DCHECK(!g_use_hang_watcher);
227 DCHECK(!g_hang_watch_workers);
228 DCHECK(!g_hang_watch_io_thread);
229 DCHECK(!g_hang_watch_ui_thread);
230
231 g_use_hang_watcher.store(base::FeatureList::IsEnabled(kEnableHangWatcher),
232 std::memory_order_relaxed);
233
234 // If hang watching is disabled as a whole there is no need to read the
235 // params.
236 if (g_use_hang_watcher.load(std::memory_order_relaxed)) {
237 g_hang_watch_workers.store(kHangWatchThreadPool.Get(),
238 std::memory_order_relaxed);
239 g_hang_watch_io_thread.store(kHangWatchIOThread.Get(),
240 std::memory_order_relaxed);
241 g_hang_watch_ui_thread.store(kHangWatchUIThread.Get(),
242 std::memory_order_relaxed);
243 }
244 }
245
// static
// Whether hang watching is enabled at all for this process.
bool HangWatcher::IsEnabled() {
  return g_use_hang_watcher.load(std::memory_order_relaxed);
}
250
// static
// Whether thread pool workers should register for hang watching.
bool HangWatcher::IsThreadPoolHangWatchingEnabled() {
  return g_hang_watch_workers.load(std::memory_order_relaxed);
}
255
// static
// Whether the IO thread should register for hang watching.
bool HangWatcher::IsIOThreadHangWatchingEnabled() {
  return g_hang_watch_io_thread.load(std::memory_order_relaxed);
}
259
// static
// Whether the UI thread should register for hang watching.
bool HangWatcher::IsUIThreadHangWatchingEnabled() {
  return g_hang_watch_ui_thread.load(std::memory_order_relaxed);
}
263
// Builds the watcher and publishes it as the process-wide instance. The
// monitoring thread is not started here; see Start().
HangWatcher::HangWatcher()
    : monitor_period_(kMonitoringPeriod),
      should_monitor_(WaitableEvent::ResetPolicy::AUTOMATIC),
      thread_(this, kThreadName),
      tick_clock_(base::DefaultTickClock::GetInstance()),
      memory_pressure_listener_(
          FROM_HERE,
          base::BindRepeating(&HangWatcher::OnMemoryPressure,
                              base::Unretained(this))) {
  // |thread_checker_| should not be bound to the constructing thread.
  DETACH_FROM_THREAD(hang_watcher_thread_checker_);

  should_monitor_.declare_only_used_while_idle();

  DCHECK(!g_instance);
  g_instance = this;
}
281
282 #if not defined(OS_NACL)
283 debug::ScopedCrashKeyString
GetTimeSinceLastCriticalMemoryPressureCrashKey()284 HangWatcher::GetTimeSinceLastCriticalMemoryPressureCrashKey() {
285 DCHECK_CALLED_ON_VALID_THREAD(hang_watcher_thread_checker_);
286
287 // The crash key size is large enough to hold the biggest possible return
288 // value from base::TimeDelta::InSeconds().
289 constexpr debug::CrashKeySize kCrashKeyContentSize =
290 debug::CrashKeySize::Size32;
291 DCHECK_GE(static_cast<uint64_t>(kCrashKeyContentSize),
292 base::NumberToString(std::numeric_limits<int64_t>::max()).size());
293
294 static debug::CrashKeyString* crash_key = AllocateCrashKeyString(
295 "seconds-since-last-memory-pressure", kCrashKeyContentSize);
296
297 const base::TimeTicks last_critical_memory_pressure_time =
298 last_critical_memory_pressure_.load(std::memory_order_relaxed);
299 if (last_critical_memory_pressure_time.is_null()) {
300 constexpr char kNoMemoryPressureMsg[] = "No critical memory pressure";
301 static_assert(
302 base::size(kNoMemoryPressureMsg) <=
303 static_cast<uint64_t>(kCrashKeyContentSize),
304 "The crash key is too small to hold \"No critical memory pressure\".");
305 return debug::ScopedCrashKeyString(crash_key, kNoMemoryPressureMsg);
306 } else {
307 base::TimeDelta time_since_last_critical_memory_pressure =
308 base::TimeTicks::Now() - last_critical_memory_pressure_time;
309 return debug::ScopedCrashKeyString(
310 crash_key, base::NumberToString(
311 time_since_last_critical_memory_pressure.InSeconds()));
312 }
313 }
314 #endif
315
OnMemoryPressure(base::MemoryPressureListener::MemoryPressureLevel memory_pressure_level)316 void HangWatcher::OnMemoryPressure(
317 base::MemoryPressureListener::MemoryPressureLevel memory_pressure_level) {
318 if (memory_pressure_level ==
319 base::MemoryPressureListener::MEMORY_PRESSURE_LEVEL_CRITICAL) {
320 last_critical_memory_pressure_.store(base::TimeTicks::Now(),
321 std::memory_order_relaxed);
322 }
323 }
324
// Unpublishes the instance and joins the monitoring thread via Stop(). All
// watched threads must have unregistered beforehand.
HangWatcher::~HangWatcher() {
  DCHECK_EQ(g_instance, this);
  DCHECK(watch_states_.empty());
  g_instance = nullptr;
  Stop();
}
331
// Starts the background monitoring thread (runs Run()).
void HangWatcher::Start() {
  thread_.Start();
}
335
// Asks the monitoring loop to exit, wakes it, and joins the thread.
void HangWatcher::Stop() {
  keep_monitoring_.store(false, std::memory_order_relaxed);
  should_monitor_.Signal();
  thread_.Join();
}
341
// Returns true when no thread is currently registered for hang watching.
bool HangWatcher::IsWatchListEmpty() {
  AutoLock auto_lock(watch_state_lock_);
  return watch_states_.empty();
}
346
// Sleeps until the next monitoring pass is due or until signaled. Waits that
// took abnormally long (machine sleep, extreme slowness) invalidate the live
// deadlines and trigger another wait instead of returning.
void HangWatcher::Wait() {
  while (true) {
    // Amount by which the actual time spent sleeping can deviate from
    // the target time and still be considered timely.
    constexpr base::TimeDelta kWaitDriftTolerance =
        base::TimeDelta::FromMilliseconds(100);

    const base::TimeTicks time_before_wait = tick_clock_->NowTicks();

    // Sleep until next scheduled monitoring or until signaled.
    const bool was_signaled = should_monitor_.TimedWait(monitor_period_);

    if (after_wait_callback_)
      after_wait_callback_.Run(time_before_wait);

    const base::TimeTicks time_after_wait = tick_clock_->NowTicks();
    const base::TimeDelta wait_time = time_after_wait - time_before_wait;
    const bool wait_was_normal =
        wait_time <= (monitor_period_ + kWaitDriftTolerance);

    UMA_HISTOGRAM_TIMES("HangWatcher.SleepDrift.BrowserProcess",
                        wait_time - monitor_period_);

    if (!wait_was_normal) {
      // If the time spent waiting was too high it might indicate the machine
      // is very slow or that it went to sleep. In any case we can't trust the
      // HangWatchScopeEnableds that are currently live. Update the ignore
      // threshold to make sure they don't trigger a hang on subsequent
      // monitors then keep waiting.

      base::AutoLock auto_lock(watch_state_lock_);

      // Find the latest deadline among the live watch states. They might
      // change atomically while iterating but that's fine because if they do
      // that means the new HangWatchScopeEnabled was constructed very soon
      // after the abnormal sleep happened and might be affected by the root
      // cause still. Ignoring it is cautious and harmless.
      base::TimeTicks latest_deadline;
      for (const auto& state : watch_states_) {
        base::TimeTicks deadline = state->GetDeadline();
        if (deadline > latest_deadline) {
          latest_deadline = deadline;
        }
      }

      deadline_ignore_threshold_ = latest_deadline;
    }

    // Stop waiting.
    if (wait_was_normal || was_signaled)
      return;
  }
}
400
Run()401 void HangWatcher::Run() {
402 // Monitor() should only run on |thread_|. Bind |thread_checker_| here to make
403 // sure of that.
404 DCHECK_CALLED_ON_VALID_THREAD(hang_watcher_thread_checker_);
405
406 while (keep_monitoring_.load(std::memory_order_relaxed)) {
407 Wait();
408
409 if (!IsWatchListEmpty() &&
410 keep_monitoring_.load(std::memory_order_relaxed)) {
411 Monitor();
412 if (after_monitor_closure_for_testing_) {
413 after_monitor_closure_for_testing_.Run();
414 }
415 }
416 }
417 }
418
// static
// Returns the process-wide instance, or nullptr before construction / after
// destruction.
HangWatcher* HangWatcher::GetInstance() {
  return g_instance;
}
423
// static
// Emits the actual hang report (a dump without crashing).
void HangWatcher::RecordHang() {
  base::debug::DumpWithoutCrashing();
  // Inhibit code folding so this frame stays distinguishable in dumps.
  const int line_number = __LINE__;
  base::debug::Alias(&line_number);
}
431
// Registers the calling thread for hang watching as |thread_type|. The
// returned closure runner unregisters the thread when it goes out of scope.
ScopedClosureRunner HangWatcher::RegisterThread(ThreadType thread_type) {
  AutoLock auto_lock(watch_state_lock_);

  watch_states_.push_back(
      internal::HangWatchState::CreateHangWatchStateForCurrentThread(
          thread_type));

  // Unretained is safe: the HangWatcher outlives all registered threads.
  return ScopedClosureRunner(BindOnce(&HangWatcher::UnregisterThread,
                                      Unretained(HangWatcher::GetInstance())));
}
442
// Returns the furthest deadline among the hung threads in this snapshot.
base::TimeTicks HangWatcher::WatchStateSnapShot::GetHighestDeadline() const {
  DCHECK(!hung_watch_state_copies_.empty());
  // Since entries are sorted in increasing order the last entry is the largest
  // one.
  return hung_watch_state_copies_.back().deadline;
}
449
// Captures a consistent view of the hung threads among |watch_states|.
// Threads with a deadline at or before |deadline_ignore_threshold| invalidate
// the whole snapshot (see the comment near the end). Also emits the per-type
// hung thread count histograms as a side effect.
HangWatcher::WatchStateSnapShot::WatchStateSnapShot(
    const HangWatchStates& watch_states,
    base::TimeTicks deadline_ignore_threshold) {
  const base::TimeTicks now = base::TimeTicks::Now();
  bool all_threads_marked = true;
  bool found_deadline_before_ignore_threshold = false;

  // Use an std::array to store the hang counts to avoid allocations. The
  // numerical values of the HangWatcher::ThreadType enum is used to index into
  // the array. A |kInvalidHangCount| is used to signify there were no threads
  // of the type found.
  constexpr size_t kHangCountArraySize =
      static_cast<std::size_t>(base::HangWatcher::ThreadType::kMax) + 1;
  std::array<int, kHangCountArraySize> hung_counts_per_thread_type;

  constexpr int kInvalidHangCount = -1;
  hung_counts_per_thread_type.fill(kInvalidHangCount);

  // Copy hung thread information.
  for (const auto& watch_state : watch_states) {
    uint64_t flags;
    base::TimeTicks deadline;
    std::tie(flags, deadline) = watch_state->GetFlagsAndDeadline();

    if (deadline <= deadline_ignore_threshold) {
      found_deadline_before_ignore_threshold = true;
    }

    // Threads whose current scope asked to be ignored do not participate in
    // the snapshot at all.
    if (internal::HangWatchDeadline::IsFlagSet(
            internal::HangWatchDeadline::Flag::
                kIgnoreCurrentHangWatchScopeEnabled,
            flags)) {
      continue;
    }

    // If a thread type is monitored and did not hang it still needs to be
    // logged as a zero count.
    const size_t hang_count_index =
        static_cast<size_t>(watch_state.get()->thread_type());
    if (hung_counts_per_thread_type[hang_count_index] == kInvalidHangCount) {
      hung_counts_per_thread_type[hang_count_index] = 0;
    }

    // Only copy hung threads.
    if (deadline <= now) {
      ++hung_counts_per_thread_type[hang_count_index];

      // Attempt to mark the thread as needing to stay within its current
      // HangWatchScopeEnabled until capture is complete.
      bool thread_marked = watch_state->SetShouldBlockOnHang(flags, deadline);

      // If marking some threads already failed the snapshot won't be kept so
      // there is no need to keep adding to it. The loop doesn't abort though
      // to keep marking the other threads. If these threads remain hung until
      // the next capture then they'll already be marked and will be included
      // in the capture at that time.
      if (thread_marked && all_threads_marked) {
        hung_watch_state_copies_.push_back(
            WatchStateCopy{deadline, watch_state.get()->GetThreadID()});
      } else {
        all_threads_marked = false;
      }
    }
  }

  // Log the hung thread counts to histograms for each thread type if any
  // thread of the type were found.
  for (size_t i = 0; i < kHangCountArraySize; ++i) {
    const int hang_count = hung_counts_per_thread_type[i];
    if (hang_count != kInvalidHangCount)
      LogHungThreadCountHistogram(static_cast<HangWatcher::ThreadType>(i),
                                  hang_count);
  }

  // Two cases can invalidate this snapshot and prevent the capture of the
  // hang.
  //
  // 1. Some threads could not be marked for blocking so this snapshot isn't
  // actionable since marked threads could be hung because of unmarked ones.
  // If only the marked threads were captured the information would be
  // incomplete.
  //
  // 2. Any of the threads have a deadline before |deadline_ignore_threshold|.
  // If any thread is ignored it reduces the confidence in the whole state and
  // it's better to avoid capturing misleading data.
  if (!all_threads_marked || found_deadline_before_ignore_threshold) {
    hung_watch_state_copies_.clear();
    return;
  }

  // Sort |hung_watch_state_copies_| by order of decreasing hang severity so
  // the most severe hang is first in the list.
  ranges::sort(hung_watch_state_copies_,
               [](const WatchStateCopy& lhs, const WatchStateCopy& rhs) {
                 return lhs.deadline < rhs.deadline;
               });
}
546
// Snapshots are copyable value objects; the defaults are correct.
HangWatcher::WatchStateSnapShot::WatchStateSnapShot(
    const WatchStateSnapShot& other) = default;

HangWatcher::WatchStateSnapShot::~WatchStateSnapShot() = default;
551
PrepareHungThreadListCrashKey() const552 std::string HangWatcher::WatchStateSnapShot::PrepareHungThreadListCrashKey()
553 const {
554 DCHECK(IsActionable());
555
556 // Build a crash key string that contains the ids of the hung threads.
557 constexpr char kSeparator{'|'};
558 std::string list_of_hung_thread_ids;
559
560 // Add as many thread ids to the crash key as possible.
561 for (const WatchStateCopy& copy : hung_watch_state_copies_) {
562 std::string fragment = base::NumberToString(copy.thread_id) + kSeparator;
563 if (list_of_hung_thread_ids.size() + fragment.size() <
564 static_cast<std::size_t>(debug::CrashKeySize::Size256)) {
565 list_of_hung_thread_ids += fragment;
566 } else {
567 // Respect the by priority ordering of thread ids in the crash key by
568 // stopping the construction as soon as one does not fit. This avoids
569 // including lesser priority ids while omitting more important ones.
570 break;
571 }
572 }
573
574 return list_of_hung_thread_ids;
575 }
576
// A snapshot is actionable when it retained at least one hung thread (it was
// not invalidated during construction).
bool HangWatcher::WatchStateSnapShot::IsActionable() const {
  return !hung_watch_state_copies_.empty();
}
580
// Test-only accessor that builds a snapshot from the current watch states.
HangWatcher::WatchStateSnapShot HangWatcher::GrabWatchStateSnapshotForTesting()
    const {
  WatchStateSnapShot snapshot(watch_states_, deadline_ignore_threshold_);
  return snapshot;
}
586
// One monitoring pass: snapshot the registered threads and capture a report
// if the snapshot shows actionable hangs.
void HangWatcher::Monitor() {
  DCHECK_CALLED_ON_VALID_THREAD(hang_watcher_thread_checker_);
  AutoLock auto_lock(watch_state_lock_);

  // If all threads unregistered since this function was invoked there's
  // nothing to do anymore.
  if (watch_states_.empty())
    return;

  WatchStateSnapShot watch_state_snapshot(watch_states_,
                                          deadline_ignore_threshold_);

  if (watch_state_snapshot.IsActionable()) {
    DoDumpWithoutCrashing(watch_state_snapshot);
  }
}
603
DoDumpWithoutCrashing(const WatchStateSnapShot & watch_state_snapshot)604 void HangWatcher::DoDumpWithoutCrashing(
605 const WatchStateSnapShot& watch_state_snapshot) {
606 capture_in_progress_.store(true, std::memory_order_relaxed);
607 base::AutoLock scope_lock(capture_lock_);
608
609 #if not defined(OS_NACL)
610 const std::string list_of_hung_thread_ids =
611 watch_state_snapshot.PrepareHungThreadListCrashKey();
612
613 static debug::CrashKeyString* crash_key = AllocateCrashKeyString(
614 "list-of-hung-threads", debug::CrashKeySize::Size256);
615
616 const debug::ScopedCrashKeyString list_of_hung_threads_crash_key_string(
617 crash_key, list_of_hung_thread_ids);
618
619 const debug::ScopedCrashKeyString
620 time_since_last_critical_memory_pressure_crash_key_string =
621 GetTimeSinceLastCriticalMemoryPressureCrashKey();
622 #endif
623
624 // To avoid capturing more than one hang that blames a subset of the same
625 // threads it's necessary to keep track of what is the furthest deadline
626 // that contributed to declaring a hang. Only once
627 // all threads have deadlines past this point can we be sure that a newly
628 // discovered hang is not directly related.
629 // Example:
630 // **********************************************************************
631 // Timeline A : L------1-------2----------3-------4----------N-----------
632 // Timeline B : -------2----------3-------4----------L----5------N-------
633 // Timeline C : L----------------------------5------6----7---8------9---N
634 // **********************************************************************
635 // In the example when a Monitor() happens during timeline A
636 // |deadline_ignore_threshold_| (L) is at time zero and deadlines (1-4)
637 // are before Now() (N) . A hang is captured and L is updated. During
638 // the next Monitor() (timeline B) a new deadline is over but we can't
639 // capture a hang because deadlines 2-4 are still live and already counted
640 // toward a hang. During a third monitor (timeline C) all live deadlines
641 // are now after L and a second hang can be recorded.
642 base::TimeTicks latest_expired_deadline =
643 watch_state_snapshot.GetHighestDeadline();
644
645 if (on_hang_closure_for_testing_)
646 on_hang_closure_for_testing_.Run();
647 else
648 RecordHang();
649
650 // Update after running the actual capture.
651 deadline_ignore_threshold_ = latest_expired_deadline;
652
653 capture_in_progress_.store(false, std::memory_order_relaxed);
654 }
655
// Installs a closure to run after every Monitor() pass. Test-only.
void HangWatcher::SetAfterMonitorClosureForTesting(
    base::RepeatingClosure closure) {
  DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
  after_monitor_closure_for_testing_ = std::move(closure);
}
661
// Replaces RecordHang() with |closure| when a hang is captured. Test-only.
void HangWatcher::SetOnHangClosureForTesting(base::RepeatingClosure closure) {
  DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
  on_hang_closure_for_testing_ = std::move(closure);
}
666
// Overrides the interval between Monitor() passes. Test-only.
void HangWatcher::SetMonitoringPeriodForTesting(base::TimeDelta period) {
  DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
  monitor_period_ = period;
}
671
// Installs a callback invoked after each wait with the time the wait started.
// Test-only.
void HangWatcher::SetAfterWaitCallbackForTesting(
    RepeatingCallback<void(TimeTicks)> callback) {
  DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
  after_wait_callback_ = callback;
}
677
// Wakes the monitoring thread out of its wait immediately. Test-only.
void HangWatcher::SignalMonitorEventForTesting() {
  DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
  should_monitor_.Signal();
}
682
// Requests the monitoring loop to exit without joining the thread. Test-only.
void HangWatcher::StopMonitoringForTesting() {
  keep_monitoring_.store(false, std::memory_order_relaxed);
}
686
// Substitutes the clock used to time waits. Test-only.
void HangWatcher::SetTickClockForTesting(const base::TickClock* tick_clock) {
  tick_clock_ = tick_clock;
}
690
BlockIfCaptureInProgress()691 void HangWatcher::BlockIfCaptureInProgress() {
692 // Makes a best-effort attempt to block execution if a hang is currently being
693 // captured.Only block on |capture_lock| if |capture_in_progress_| hints that
694 // it's already held to avoid serializing all threads on this function when no
695 // hang capture is in-progress.
696 if (capture_in_progress_.load(std::memory_order_relaxed))
697 base::AutoLock hang_lock(capture_lock_);
698 }
699
UnregisterThread()700 void HangWatcher::UnregisterThread() {
701 AutoLock auto_lock(watch_state_lock_);
702
703 internal::HangWatchState* current_hang_watch_state =
704 internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();
705
706 auto it = ranges::find_if(
707 watch_states_,
708 [current_hang_watch_state](
709 const std::unique_ptr<internal::HangWatchState>& state) {
710 return state.get() == current_hang_watch_state;
711 });
712
713 // Thread should be registered to get unregistered.
714 DCHECK(it != watch_states_.end());
715
716 watch_states_.erase(it);
717 }
718
namespace internal {
namespace {

// Bit layout of |HangWatchDeadline::bits_|: the low 56 bits hold the deadline
// and the high 8 bits hold flags.
constexpr uint64_t kOnlyDeadlineMask = 0x00FFFFFFFFFFFFFFu;
constexpr uint64_t kOnlyFlagsMask = ~kOnlyDeadlineMask;
// Highest bit a single flag may occupy (the top bit of the 64-bit word).
constexpr uint64_t kMaximumFlag = 0x8000000000000000u;

// Use as a mask to keep persistent flags and the deadline.
constexpr uint64_t kPersistentFlagsAndDeadlineMask =
    kOnlyDeadlineMask |
    static_cast<uint64_t>(
        HangWatchDeadline::Flag::kIgnoreCurrentHangWatchScopeEnabled) |
    static_cast<uint64_t>(
        HangWatchDeadline::Flag::kHasActiveHangWatchScopeDisabled);
}  // namespace
734
735 // Flag binary representation assertions.
736 static_assert(
737 static_cast<uint64_t>(HangWatchDeadline::Flag::kMinValue) >
738 kOnlyDeadlineMask,
739 "Invalid numerical value for flag. Would interfere with bits of data.");
740 static_assert(static_cast<uint64_t>(HangWatchDeadline::Flag::kMaxValue) <=
741 kMaximumFlag,
742 "A flag can only set a single bit.");
743
// No custom construction/teardown needed; |bits_| default-initializes.
HangWatchDeadline::HangWatchDeadline() = default;
HangWatchDeadline::~HangWatchDeadline() = default;
746
GetFlagsAndDeadline() const747 std::pair<uint64_t, TimeTicks> HangWatchDeadline::GetFlagsAndDeadline() const {
748 uint64_t bits = bits_.load(std::memory_order_relaxed);
749 return std::make_pair(ExtractFlags(bits),
750 DeadlineFromBits(ExtractDeadline((bits))));
751 }
752
// Returns only the deadline portion of |bits_|.
TimeTicks HangWatchDeadline::GetDeadline() const {
  return DeadlineFromBits(
      ExtractDeadline(bits_.load(std::memory_order_relaxed)));
}
757
// static
// Largest deadline this class can represent.
TimeTicks HangWatchDeadline::Max() {
  // |kOnlyDeadlineMask| has all the bits reserved for the TimeTicks value
  // set. This means it also represents the highest representable value.
  return DeadlineFromBits(kOnlyDeadlineMask);
}
764
765 // static
IsFlagSet(Flag flag,uint64_t flags)766 bool HangWatchDeadline::IsFlagSet(Flag flag, uint64_t flags) {
767 return static_cast<uint64_t>(flag) & flags;
768 }
769
// Stores |new_deadline| into the deadline bits while preserving persistent
// flags and clearing non-persistent ones. Only called from the owning thread.
void HangWatchDeadline::SetDeadline(TimeTicks new_deadline) {
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
  DCHECK(new_deadline <= Max()) << "Value too high to be represented.";
  DCHECK(new_deadline >= TimeTicks{}) << "Value cannot be negative.";

  if (switch_bits_callback_for_testing_) {
    const uint64_t switched_in_bits = SwitchBitsForTesting();
    // If a concurrent deadline change is tested it cannot have a deadline or
    // persistent flag change since those always happen on the same thread.
    DCHECK((switched_in_bits & kPersistentFlagsAndDeadlineMask) == 0u);
  }

  // Discard all non-persistent flags and apply deadline change.
  const uint64_t old_bits = bits_.load(std::memory_order_relaxed);
  const uint64_t new_flags =
      ExtractFlags(old_bits & kPersistentFlagsAndDeadlineMask);
  bits_.store(new_flags | ExtractDeadline(new_deadline.ToInternalValue()),
              std::memory_order_relaxed);
}
789
// TODO(crbug.com/1087026): Add flag DCHECKs here.
// Atomically sets kShouldBlockOnHang, but only if |bits_| still equals the
// previously observed |old_flags|/|old_deadline| pair. Returns whether the
// flag was set (i.e. the compare-exchange succeeded).
bool HangWatchDeadline::SetShouldBlockOnHang(uint64_t old_flags,
                                             TimeTicks old_deadline) {
  DCHECK(old_deadline <= Max()) << "Value too high to be represented.";
  DCHECK(old_deadline >= TimeTicks{}) << "Value cannot be negative.";

  // Set the kShouldBlockOnHang flag only if |bits_| did not change since it
  // was read. kShouldBlockOnHang is the only non-persistent flag and should
  // never be set twice. Persistent flags and deadline changes are done from
  // the same thread so there is no risk of losing concurrently added
  // information.
  uint64_t old_bits =
      old_flags | static_cast<uint64_t>(old_deadline.ToInternalValue());
  const uint64_t desired_bits =
      old_bits | static_cast<uint64_t>(Flag::kShouldBlockOnHang);

  // If a test needs to simulate |bits_| changing since calling this function
  // this happens now.
  if (switch_bits_callback_for_testing_) {
    const uint64_t switched_in_bits = SwitchBitsForTesting();

    // Injecting the flag being tested is invalid.
    DCHECK(!IsFlagSet(Flag::kShouldBlockOnHang, switched_in_bits));
  }

  // Weak exchange is acceptable: a spurious failure is treated like a lost
  // race and simply reported to the caller.
  return bits_.compare_exchange_weak(old_bits, desired_bits,
                                     std::memory_order_relaxed,
                                     std::memory_order_relaxed);
}
818
void HangWatchDeadline::SetHasActiveHangWatchScopeDisabled() {
  // Persistent: survives deadline changes until explicitly cleared.
  SetPersistentFlag(Flag::kHasActiveHangWatchScopeDisabled);
}
822
void HangWatchDeadline::UnsetHasActiveHangWatchScopeDisabled() {
  // Clears the flag set by SetHasActiveHangWatchScopeDisabled().
  ClearPersistentFlag(Flag::kHasActiveHangWatchScopeDisabled);
}
826
void HangWatchDeadline::SetIgnoreCurrentHangWatchScopeEnabled() {
  // Persistent: survives deadline changes until explicitly cleared.
  SetPersistentFlag(Flag::kIgnoreCurrentHangWatchScopeEnabled);
}
830
void HangWatchDeadline::UnsetIgnoreCurrentHangWatchScopeEnabled() {
  // Clears the flag set by SetIgnoreCurrentHangWatchScopeEnabled().
  ClearPersistentFlag(Flag::kIgnoreCurrentHangWatchScopeEnabled);
}
834
void HangWatchDeadline::SetPersistentFlag(Flag flag) {
  // Persistent flags are only mutated from the owning thread.
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
  // Let tests simulate a concurrent |bits_| change before the RMW applies.
  if (switch_bits_callback_for_testing_)
    SwitchBitsForTesting();
  // Atomic OR preserves any concurrent changes to the other bits.
  bits_.fetch_or(static_cast<uint64_t>(flag), std::memory_order_relaxed);
}
841
void HangWatchDeadline::ClearPersistentFlag(Flag flag) {
  // Persistent flags are only mutated from the owning thread.
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
  // Let tests simulate a concurrent |bits_| change before the RMW applies.
  if (switch_bits_callback_for_testing_)
    SwitchBitsForTesting();
  // Atomic AND with the inverted mask clears only |flag|'s bit.
  bits_.fetch_and(~(static_cast<uint64_t>(flag)), std::memory_order_relaxed);
}
848
// static
uint64_t HangWatchDeadline::ExtractFlags(uint64_t bits) {
  // Keep only the bits reserved for flags; deadline bits are zeroed.
  return bits & kOnlyFlagsMask;
}
853
// static
uint64_t HangWatchDeadline::ExtractDeadline(uint64_t bits) {
  // Keep only the bits reserved for the deadline; flag bits are zeroed.
  return bits & kOnlyDeadlineMask;
}
858
// static
TimeTicks HangWatchDeadline::DeadlineFromBits(uint64_t bits) {
  // |kOnlyDeadlineMask| has all the deadline bits set to 1 so is the largest
  // representable value.
  DCHECK(bits <= kOnlyDeadlineMask)
      << "Flags bits are set. Remove them before returning deadline.";
  return TimeTicks::FromInternalValue(bits);
}
867
IsFlagSet(Flag flag) const868 bool HangWatchDeadline::IsFlagSet(Flag flag) const {
869 return bits_.load(std::memory_order_relaxed) & static_cast<uint64_t>(flag);
870 }
871
SetSwitchBitsClosureForTesting(RepeatingCallback<uint64_t (void)> closure)872 void HangWatchDeadline::SetSwitchBitsClosureForTesting(
873 RepeatingCallback<uint64_t(void)> closure) {
874 switch_bits_callback_for_testing_ = closure;
875 }
876
void HangWatchDeadline::ResetSwitchBitsClosureForTesting() {
  // Resetting a callback that was never installed indicates a test bug.
  DCHECK(switch_bits_callback_for_testing_);
  switch_bits_callback_for_testing_.Reset();
}
881
uint64_t HangWatchDeadline::SwitchBitsForTesting() {
  DCHECK(switch_bits_callback_for_testing_);

  // Keep the flags that were already set on |bits_| and merge in whatever the
  // test callback supplies, then publish the combined value. Returns the
  // value that was stored so callers can DCHECK on what was injected.
  const uint64_t old_bits = bits_.load(std::memory_order_relaxed);
  const uint64_t new_bits = switch_bits_callback_for_testing_.Run();
  const uint64_t old_flags = ExtractFlags(old_bits);

  const uint64_t switched_in_bits = old_flags | new_bits;
  bits_.store(switched_in_bits, std::memory_order_relaxed);
  return switched_in_bits;
}
893
HangWatchState::HangWatchState(HangWatcher::ThreadType thread_type)
    : thread_id_(PlatformThread::CurrentId()), thread_type_(thread_type) {
  // There should not exist a state object for this thread already.
  DCHECK(!GetHangWatchStateForCurrentThread()->Get());

  // Bind the new instance to this thread via the thread-local slot so that
  // scopes created on this thread can find it.
  GetHangWatchStateForCurrentThread()->Set(this);
}
902
HangWatchState::~HangWatchState() {
  // Must be destroyed on the same thread that created it, since it unbinds
  // itself from that thread's thread-local slot.
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);

  DCHECK_EQ(GetHangWatchStateForCurrentThread()->Get(), this);
  GetHangWatchStateForCurrentThread()->Set(nullptr);

#if DCHECK_IS_ON()
  // Destroying the HangWatchState should not be done if there are live
  // HangWatchScopeEnableds.
  DCHECK(!current_hang_watch_scope_enable_);
#endif
}
915
916 // static
917 std::unique_ptr<HangWatchState>
CreateHangWatchStateForCurrentThread(HangWatcher::ThreadType thread_type)918 HangWatchState::CreateHangWatchStateForCurrentThread(
919 HangWatcher::ThreadType thread_type) {
920 // Allocate a watch state object for this thread.
921 std::unique_ptr<HangWatchState> hang_state =
922 std::make_unique<HangWatchState>(thread_type);
923
924 // Setting the thread local worked.
925 DCHECK_EQ(GetHangWatchStateForCurrentThread()->Get(), hang_state.get());
926
927 // Transfer ownership to caller.
928 return hang_state;
929 }
930
TimeTicks HangWatchState::GetDeadline() const {
  // Forwards to the underlying HangWatchDeadline.
  return deadline_.GetDeadline();
}
934
std::pair<uint64_t, TimeTicks> HangWatchState::GetFlagsAndDeadline() const {
  // Returns a consistent {flags, deadline} snapshot taken in one atomic read.
  return deadline_.GetFlagsAndDeadline();
}
938
void HangWatchState::SetDeadline(TimeTicks deadline) {
  // Deadline changes may only come from the thread that owns this state.
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
  deadline_.SetDeadline(deadline);
}
943
IsOverDeadline() const944 bool HangWatchState::IsOverDeadline() const {
945 return TimeTicks::Now() > deadline_.GetDeadline();
946 }
947
void HangWatchState::SetHasActiveHangWatchScopeDisabled() {
  // Forwards to the underlying HangWatchDeadline.
  deadline_.SetHasActiveHangWatchScopeDisabled();
}
951
void HangWatchState::UnsetHasActiveHangWatchScopeDisabled() {
  // Forwards to the underlying HangWatchDeadline.
  deadline_.UnsetHasActiveHangWatchScopeDisabled();
}
955
void HangWatchState::SetIgnoreCurrentHangWatchScopeEnabled() {
  // Forwards to the underlying HangWatchDeadline.
  deadline_.SetIgnoreCurrentHangWatchScopeEnabled();
}
959
void HangWatchState::UnsetIgnoreCurrentHangWatchScopeEnabled() {
  // Forwards to the underlying HangWatchDeadline.
  deadline_.UnsetIgnoreCurrentHangWatchScopeEnabled();
}
963
bool HangWatchState::SetShouldBlockOnHang(uint64_t old_flags,
                                          TimeTicks old_deadline) {
  // Returns false if the underlying bits changed since {old_flags,
  // old_deadline} were read, in which case the flag was not set.
  return deadline_.SetShouldBlockOnHang(old_flags, old_deadline);
}
968
bool HangWatchState::IsFlagSet(HangWatchDeadline::Flag flag) {
  // Forwards to the underlying HangWatchDeadline.
  return deadline_.IsFlagSet(flag);
}
972
#if DCHECK_IS_ON()
// Records the innermost live HangWatchScopeEnabled on this thread; used only
// for DCHECKs (e.g. the destructor verifies no scope outlives this state).
void HangWatchState::SetCurrentHangWatchScopeEnabled(
    HangWatchScopeEnabled* current_hang_watch_scope_enable) {
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
  current_hang_watch_scope_enable_ = current_hang_watch_scope_enable;
}

// Returns the scope recorded by SetCurrentHangWatchScopeEnabled(), or null.
HangWatchScopeEnabled* HangWatchState::GetCurrentHangWatchScopeEnabled() {
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
  return current_hang_watch_scope_enable_;
}
#endif
985
HangWatchDeadline* HangWatchState::GetHangWatchDeadlineForTesting() {
  // Exposes the internal deadline object so tests can manipulate it directly.
  return &deadline_;
}
989
void HangWatchState::IncrementNestingLevel() {
  // Tracks how many nested hang-watch scopes are live on this thread.
  ++nesting_level_;
}
993
void HangWatchState::DecrementNestingLevel() {
  // Matches IncrementNestingLevel(); called when a nested scope exits.
  --nesting_level_;
}
997
// static
ThreadLocalPointer<HangWatchState>*
HangWatchState::GetHangWatchStateForCurrentThread() {
  // Intentionally leaked (NoDestructor): the slot must stay valid for the
  // whole process lifetime since threads may consult it during shutdown.
  static NoDestructor<ThreadLocalPointer<HangWatchState>> hang_watch_state;
  return hang_watch_state.get();
}
1004
PlatformThreadId HangWatchState::GetThreadID() const {
  // Captured once in the constructor via PlatformThread::CurrentId().
  return thread_id_;
}
1008
1009 } // namespace internal
1010
1011 } // namespace base
1012