1 // Copyright 2020 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/threading/hang_watcher.h"
6 
#include <array>
#include <atomic>
#include <limits>
#include <utility>
9 
10 #include "base/bind.h"
11 #include "base/callback_helpers.h"
12 #include "base/containers/flat_map.h"
13 #include "base/debug/alias.h"
14 #include "base/debug/crash_logging.h"
15 #include "base/debug/dump_without_crashing.h"
16 #include "base/feature_list.h"
17 #include "base/metrics/field_trial_params.h"
18 #include "base/metrics/histogram_macros.h"
19 #include "base/no_destructor.h"
20 #include "base/ranges/algorithm.h"
21 #include "base/strings/string_number_conversions.h"
22 #include "base/synchronization/lock.h"
23 #include "base/synchronization/waitable_event.h"
24 #include "base/threading/platform_thread.h"
25 #include "base/threading/thread_checker.h"
26 #include "base/threading/thread_restrictions.h"
27 #include "base/time/default_tick_clock.h"
28 #include "base/time/time.h"
29 #include "build/build_config.h"
30 
31 namespace base {
32 
// static
// Master switch for hang watching. When disabled, the per-thread-type params
// below are never read (see InitializeOnMainThread()).
constexpr base::Feature kEnableHangWatcher{"EnableHangWatcher",
                                           base::FEATURE_DISABLED_BY_DEFAULT};
// Per-thread-type opt-ins; each only takes effect when |kEnableHangWatcher|
// itself is enabled.
constexpr base::FeatureParam<bool> kHangWatchIOThread{
    &kEnableHangWatcher, "hang_watch_io_thread", false};
constexpr base::FeatureParam<bool> kHangWatchUIThread{
    &kEnableHangWatcher, "hang_watch_ui_thread", false};
constexpr base::FeatureParam<bool> kHangWatchThreadPool{
    &kEnableHangWatcher, "hang_watch_threadpool", false};

// Default timeout applied by HangWatchScopeEnabled when none is specified.
constexpr base::TimeDelta HangWatchScopeEnabled::kDefaultHangWatchTime =
    base::TimeDelta::FromSeconds(10);
45 
46 namespace {
// The one and only HangWatcher instance; set/cleared by the HangWatcher
// constructor/destructor.
HangWatcher* g_instance = nullptr;
// Cached feature state: written on the main thread in
// InitializeOnMainThread() and then read lock-free from any thread.
std::atomic<bool> g_use_hang_watcher{false};
std::atomic<bool> g_hang_watch_workers{false};
std::atomic<bool> g_hang_watch_io_thread{false};
std::atomic<bool> g_hang_watch_ui_thread{false};
52 
// Emits the hung thread count histogram. |count| is the number of threads
// of type |thread_type| that were hung or became hung during the last
// monitoring window. This function should be invoked for each thread type
// encountered on each call to Monitor().
void LogHungThreadCountHistogram(HangWatcher::ThreadType thread_type,
                                 int count) {
  // Upper bound for the exact-linear histograms below.
  constexpr int kMaxHungThreadCount = 100;
  // One histogram per thread type. The UMA macros need a distinct string
  // literal per call site, hence the repetition in each case.
  switch (thread_type) {
    case HangWatcher::ThreadType::kIOThread:
      UMA_HISTOGRAM_EXACT_LINEAR(
          "HangWatcher.NumberOfHungThreadsDuringWatchWindow.BrowserProcess."
          "IOThread",
          count, kMaxHungThreadCount);
      break;
    case HangWatcher::ThreadType::kUIThread:
      UMA_HISTOGRAM_EXACT_LINEAR(
          "HangWatcher.NumberOfHungThreadsDuringWatchWindow.BrowserProcess."
          "UIThread",
          count, kMaxHungThreadCount);
      break;
    case HangWatcher::ThreadType::kThreadPoolThread:
      UMA_HISTOGRAM_EXACT_LINEAR(
          "HangWatcher.NumberOfHungThreadsDuringWatchWindow.BrowserProcess."
          "ThreadPool",
          count, kMaxHungThreadCount);
      break;
  }
}
81 }
82 
// Name of the hang watching background thread.
constexpr const char* kThreadName = "HangWatcher";

// The time that the HangWatcher thread will sleep for between calls to
// Monitor(). Increasing or decreasing this does not modify the type of hangs
// that can be detected. It instead increases the probability that a call to
// Monitor() will happen at the right time to catch a hang. This has to be
// balanced with power/cpu use concerns as busy looping would catch almost all
// hangs but present unacceptable overhead.
const base::TimeDelta kMonitoringPeriod = base::TimeDelta::FromSeconds(10);
92 
HangWatchScopeEnabled(TimeDelta timeout)93 HangWatchScopeEnabled::HangWatchScopeEnabled(TimeDelta timeout) {
94   internal::HangWatchState* current_hang_watch_state =
95       internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();
96 
97   DCHECK(timeout >= base::TimeDelta()) << "Negative timeouts are invalid.";
98 
99   // TODO(crbug.com/1034046): Remove when all threads using
100   // HangWatchScopeEnabled are monitored. Thread is not monitored, noop.
101   if (!current_hang_watch_state)
102     return;
103 
104   DCHECK(current_hang_watch_state)
105       << "HangWatchScopeEnabled can only be used on a thread that "
106          "registered for hang watching with HangWatcher::RegisterThread.";
107 
108 #if DCHECK_IS_ON()
109   previous_hang_watch_scope_enable_ =
110       current_hang_watch_state->GetCurrentHangWatchScopeEnabled();
111   current_hang_watch_state->SetCurrentHangWatchScopeEnabled(this);
112 #endif
113 
114   uint64_t old_flags;
115   base::TimeTicks old_deadline;
116   std::tie(old_flags, old_deadline) =
117       current_hang_watch_state->GetFlagsAndDeadline();
118 
119   const bool hangs_ignored_for_current_scope =
120       internal::HangWatchDeadline::IsFlagSet(
121           internal::HangWatchDeadline::Flag::
122               kIgnoreCurrentHangWatchScopeEnabled,
123           old_flags);
124 
125   const bool has_active_hang_watch_disabled =
126       internal::HangWatchDeadline::IsFlagSet(
127           internal::HangWatchDeadline::Flag::kHasActiveHangWatchScopeDisabled,
128           old_flags);
129 
130   // If the current HangWatchScopeEnabled is ignored but there are no active
131   // HangWatchScopeDisabled instances, temporarely reactivate hang watching for
132   // this newly created HangWatchScopeEnabled. On exiting hang watching is
133   // suspended again to return to the original state.
134   if (hangs_ignored_for_current_scope && !has_active_hang_watch_disabled) {
135     current_hang_watch_state->UnsetIgnoreCurrentHangWatchScopeEnabled();
136     set_hangs_ignored_on_exit_ = true;
137   }
138 
139   // TODO(crbug.com/1034046): Check whether we are over deadline already for the
140   // previous HangWatchScopeEnabled here by issuing only one TimeTicks::Now()
141   // and resuing the value.
142 
143   previous_deadline_ = old_deadline;
144   TimeTicks deadline = TimeTicks::Now() + timeout;
145   current_hang_watch_state->SetDeadline(deadline);
146   current_hang_watch_state->IncrementNestingLevel();
147 }
148 
// Restores the enclosing scope's deadline and ignore state. The statement
// order here is significant: blocking on an in-progress capture must happen
// before any bookkeeping is undone.
HangWatchScopeEnabled::~HangWatchScopeEnabled() {
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);

  internal::HangWatchState* current_hang_watch_state =
      internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();

  // TODO(crbug.com/1034046): Remove when all threads using
  // HangWatchScopeEnabled are monitored. Thread is not monitored, noop.
  if (!current_hang_watch_state) {
    return;
  }

  // If a hang is currently being captured we should block here so execution
  // stops and we avoid recording unrelated stack frames in the crash.
  if (current_hang_watch_state->IsFlagSet(
          internal::HangWatchDeadline::Flag::kShouldBlockOnHang))
    base::HangWatcher::GetInstance()->BlockIfCaptureInProgress();

#if DCHECK_IS_ON()
  // Verify that no Scope was destructed out of order.
  DCHECK_EQ(this, current_hang_watch_state->GetCurrentHangWatchScopeEnabled());
  current_hang_watch_state->SetCurrentHangWatchScopeEnabled(
      previous_hang_watch_scope_enable_);
#endif

  // If a HangWatchScopeDisabled suspended hang watching during the
  // lifetime of this or any nested HangWatchScopeEnabled it can now safely be
  // reactivated by clearing the ignore bit since this is the outer-most scope.
  // See HangWatchScopeDisabled class comments where this represents the
  // destruction of |scope_1|.
  if (current_hang_watch_state->nesting_level() == 1)
    current_hang_watch_state->UnsetIgnoreCurrentHangWatchScopeEnabled();
  // Return to ignoring hangs since this was the previous state before hang
  // watching was temporarily enabled for this HangWatchScopeEnabled only in the
  // constructor. See HangWatchScopeDisabled class comments where the next line
  // of code is part of the destruction of |scope_4|.
  else if (set_hangs_ignored_on_exit_)
    current_hang_watch_state->SetIgnoreCurrentHangWatchScopeEnabled();

  // Reset the deadline to the value it had before entering this
  // HangWatchScopeEnabled.
  current_hang_watch_state->SetDeadline(previous_deadline_);
  // TODO(crbug.com/1034046): Log when a HangWatchScopeEnabled exits after its
  // deadline and that went undetected by the HangWatcher.

  current_hang_watch_state->DecrementNestingLevel();
}
196 
HangWatchScopeDisabled()197 HangWatchScopeDisabled::HangWatchScopeDisabled() {
198   internal::HangWatchState* current_hang_watch_state =
199       internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();
200 
201   if (!current_hang_watch_state) {
202     return;
203   }
204 
205   current_hang_watch_state->SetIgnoreCurrentHangWatchScopeEnabled();
206   current_hang_watch_state->SetHasActiveHangWatchScopeDisabled();
207 }
208 
~HangWatchScopeDisabled()209 HangWatchScopeDisabled::~HangWatchScopeDisabled() {
210   internal::HangWatchState* current_hang_watch_state =
211       internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();
212 
213   if (!current_hang_watch_state)
214     return;
215 
216   // If this instance outlived all HangWatchScopeEnabled instances watching
217   // needs to be reactivated.
218   if (current_hang_watch_state->nesting_level() == 0)
219     current_hang_watch_state->UnsetIgnoreCurrentHangWatchScopeEnabled();
220 
221   current_hang_watch_state->UnsetHasActiveHangWatchScopeDisabled();
222 }
223 
224 // static
InitializeOnMainThread()225 void HangWatcher::InitializeOnMainThread() {
226   DCHECK(!g_use_hang_watcher);
227   DCHECK(!g_hang_watch_workers);
228   DCHECK(!g_hang_watch_io_thread);
229   DCHECK(!g_hang_watch_ui_thread);
230 
231   g_use_hang_watcher.store(base::FeatureList::IsEnabled(kEnableHangWatcher),
232                            std::memory_order_relaxed);
233 
234   // If hang watching is disabled as a whole there is no need to read the
235   // params.
236   if (g_use_hang_watcher.load(std::memory_order_relaxed)) {
237     g_hang_watch_workers.store(kHangWatchThreadPool.Get(),
238                                std::memory_order_relaxed);
239     g_hang_watch_io_thread.store(kHangWatchIOThread.Get(),
240                                  std::memory_order_relaxed);
241     g_hang_watch_ui_thread.store(kHangWatchUIThread.Get(),
242                                  std::memory_order_relaxed);
243   }
244 }
245 
// static
// Whether the HangWatcher feature is enabled at all for this process.
bool HangWatcher::IsEnabled() {
  return g_use_hang_watcher.load(std::memory_order_relaxed);
}

// static
bool HangWatcher::IsThreadPoolHangWatchingEnabled() {
  return g_hang_watch_workers.load(std::memory_order_relaxed);
}

// static
bool HangWatcher::IsIOThreadHangWatchingEnabled() {
  return g_hang_watch_io_thread.load(std::memory_order_relaxed);
}

// static
bool HangWatcher::IsUIThreadHangWatchingEnabled() {
  return g_hang_watch_ui_thread.load(std::memory_order_relaxed);
}
263 
// Constructs the process-wide HangWatcher singleton. Monitoring does not
// begin until Start() is called.
HangWatcher::HangWatcher()
    : monitor_period_(kMonitoringPeriod),
      should_monitor_(WaitableEvent::ResetPolicy::AUTOMATIC),
      thread_(this, kThreadName),
      tick_clock_(base::DefaultTickClock::GetInstance()),
      memory_pressure_listener_(
          FROM_HERE,
          base::BindRepeating(&HangWatcher::OnMemoryPressure,
                              base::Unretained(this))) {
  // |thread_checker_| should not be bound to the constructing thread.
  DETACH_FROM_THREAD(hang_watcher_thread_checker_);

  // The monitoring thread only waits on this event while it is otherwise
  // idle, so the wait should not count toward hang detection heuristics.
  should_monitor_.declare_only_used_while_idle();

  // Enforce the singleton invariant: only one HangWatcher at a time.
  DCHECK(!g_instance);
  g_instance = this;
}
281 
282 #if not defined(OS_NACL)
283 debug::ScopedCrashKeyString
GetTimeSinceLastCriticalMemoryPressureCrashKey()284 HangWatcher::GetTimeSinceLastCriticalMemoryPressureCrashKey() {
285   DCHECK_CALLED_ON_VALID_THREAD(hang_watcher_thread_checker_);
286 
287   // The crash key size is large enough to hold the biggest possible return
288   // value from base::TimeDelta::InSeconds().
289   constexpr debug::CrashKeySize kCrashKeyContentSize =
290       debug::CrashKeySize::Size32;
291   DCHECK_GE(static_cast<uint64_t>(kCrashKeyContentSize),
292             base::NumberToString(std::numeric_limits<int64_t>::max()).size());
293 
294   static debug::CrashKeyString* crash_key = AllocateCrashKeyString(
295       "seconds-since-last-memory-pressure", kCrashKeyContentSize);
296 
297   const base::TimeTicks last_critical_memory_pressure_time =
298       last_critical_memory_pressure_.load(std::memory_order_relaxed);
299   if (last_critical_memory_pressure_time.is_null()) {
300     constexpr char kNoMemoryPressureMsg[] = "No critical memory pressure";
301     static_assert(
302         base::size(kNoMemoryPressureMsg) <=
303             static_cast<uint64_t>(kCrashKeyContentSize),
304         "The crash key is too small to hold \"No critical memory pressure\".");
305     return debug::ScopedCrashKeyString(crash_key, kNoMemoryPressureMsg);
306   } else {
307     base::TimeDelta time_since_last_critical_memory_pressure =
308         base::TimeTicks::Now() - last_critical_memory_pressure_time;
309     return debug::ScopedCrashKeyString(
310         crash_key, base::NumberToString(
311                        time_since_last_critical_memory_pressure.InSeconds()));
312   }
313 }
314 #endif
315 
OnMemoryPressure(base::MemoryPressureListener::MemoryPressureLevel memory_pressure_level)316 void HangWatcher::OnMemoryPressure(
317     base::MemoryPressureListener::MemoryPressureLevel memory_pressure_level) {
318   if (memory_pressure_level ==
319       base::MemoryPressureListener::MEMORY_PRESSURE_LEVEL_CRITICAL) {
320     last_critical_memory_pressure_.store(base::TimeTicks::Now(),
321                                          std::memory_order_relaxed);
322   }
323 }
324 
HangWatcher::~HangWatcher() {
  DCHECK_EQ(g_instance, this);
  // All watched threads must have unregistered before destruction.
  DCHECK(watch_states_.empty());
  g_instance = nullptr;
  // Joins the monitoring thread.
  Stop();
}
331 
// Starts the background monitoring thread.
void HangWatcher::Start() {
  thread_.Start();
}
335 
// Asks the loop in Run() to exit, wakes it if it is waiting, and joins the
// monitoring thread.
void HangWatcher::Stop() {
  keep_monitoring_.store(false, std::memory_order_relaxed);
  should_monitor_.Signal();
  thread_.Join();
}
341 
// Returns true when no thread is currently registered for hang watching.
bool HangWatcher::IsWatchListEmpty() {
  AutoLock auto_lock(watch_state_lock_);
  return watch_states_.empty();
}
346 
// Sleeps until the next monitoring period elapses or the watcher is signaled.
// If the wait took abnormally long (machine suspend, extreme load) the live
// deadlines cannot be trusted, so they are marked to be ignored and the wait
// is retried.
void HangWatcher::Wait() {
  while (true) {
    // Amount by which the actual time spent sleeping can deviate from
    // the target time and still be considered timely.
    constexpr base::TimeDelta kWaitDriftTolerance =
        base::TimeDelta::FromMilliseconds(100);

    const base::TimeTicks time_before_wait = tick_clock_->NowTicks();

    // Sleep until next scheduled monitoring or until signaled.
    const bool was_signaled = should_monitor_.TimedWait(monitor_period_);

    // Test-only hook observing when the wait started.
    if (after_wait_callback_)
      after_wait_callback_.Run(time_before_wait);

    const base::TimeTicks time_after_wait = tick_clock_->NowTicks();
    const base::TimeDelta wait_time = time_after_wait - time_before_wait;
    const bool wait_was_normal =
        wait_time <= (monitor_period_ + kWaitDriftTolerance);

    UMA_HISTOGRAM_TIMES("HangWatcher.SleepDrift.BrowserProcess",
                        wait_time - monitor_period_);

    if (!wait_was_normal) {
      // If the time spent waiting was too high it might indicate the machine is
      // very slow or that it went to sleep. In any case we can't trust the
      // HangWatchScopeEnableds that are currently live. Update the ignore
      // threshold to make sure they don't trigger a hang on subsequent monitors
      // then keep waiting.

      base::AutoLock auto_lock(watch_state_lock_);

      // Find the latest deadline among the live watch states. They might change
      // atomically while iterating but that's fine because if they do that
      // means the new HangWatchScopeEnabled was constructed very soon after the
      // abnormal sleep happened and might be affected by the root cause still.
      // Ignoring it is cautious and harmless.
      base::TimeTicks latest_deadline;
      for (const auto& state : watch_states_) {
        base::TimeTicks deadline = state->GetDeadline();
        if (deadline > latest_deadline) {
          latest_deadline = deadline;
        }
      }

      deadline_ignore_threshold_ = latest_deadline;
    }

    // Stop waiting.
    if (wait_was_normal || was_signaled)
      return;
  }
}
400 
// Main loop of the monitoring thread: wait one period, then monitor, until
// asked to stop via |keep_monitoring_|.
void HangWatcher::Run() {
  // Monitor() should only run on |thread_|. Bind |thread_checker_| here to make
  // sure of that.
  DCHECK_CALLED_ON_VALID_THREAD(hang_watcher_thread_checker_);

  while (keep_monitoring_.load(std::memory_order_relaxed)) {
    Wait();

    // Skip monitoring if no threads are registered or if a stop was requested
    // while waiting.
    if (!IsWatchListEmpty() &&
        keep_monitoring_.load(std::memory_order_relaxed)) {
      Monitor();
      if (after_monitor_closure_for_testing_) {
        after_monitor_closure_for_testing_.Run();
      }
    }
  }
}
418 
// static
HangWatcher* HangWatcher::GetInstance() {
  // Null before the HangWatcher is constructed and after it is destroyed.
  return g_instance;
}
423 
// static
// Records the detected hang by capturing a dump without crashing the process.
void HangWatcher::RecordHang() {
  base::debug::DumpWithoutCrashing();
  // Inhibit code folding: aliasing __LINE__ keeps this function distinct in
  // the binary so the dump's stack identifies it unambiguously.
  const int line_number = __LINE__;
  base::debug::Alias(&line_number);
}
431 
// Registers the calling thread for hang watching as |thread_type| and returns
// a closure that unregisters it when run.
ScopedClosureRunner HangWatcher::RegisterThread(ThreadType thread_type) {
  AutoLock auto_lock(watch_state_lock_);

  watch_states_.push_back(
      internal::HangWatchState::CreateHangWatchStateForCurrentThread(
          thread_type));

  // NOTE(review): Unretained() assumes the HangWatcher outlives the returned
  // closure — confirm against the HangWatcher's lifetime guarantees.
  return ScopedClosureRunner(BindOnce(&HangWatcher::UnregisterThread,
                                      Unretained(HangWatcher::GetInstance())));
}
442 
// Returns the latest deadline among the recorded hung threads. Only valid on
// an actionable (non-empty) snapshot.
base::TimeTicks HangWatcher::WatchStateSnapShot::GetHighestDeadline() const {
  DCHECK(!hung_watch_state_copies_.empty());
  // Since entries are sorted in increasing order the last entry is the largest
  // one.
  return hung_watch_state_copies_.back().deadline;
}
449 
// Builds a snapshot of hung threads from |watch_states|, logging per-type hung
// thread counts as a side effect. The snapshot is cleared (made
// non-actionable) when any thread could not be marked for blocking or when any
// deadline predates |deadline_ignore_threshold|.
HangWatcher::WatchStateSnapShot::WatchStateSnapShot(
    const HangWatchStates& watch_states,
    base::TimeTicks deadline_ignore_threshold) {
  const base::TimeTicks now = base::TimeTicks::Now();
  bool all_threads_marked = true;
  bool found_deadline_before_ignore_threshold = false;

  // Use an std::array to store the hang counts to avoid allocations. The
  // numerical values of the HangWatcher::ThreadType enum is used to index into
  // the array. A |kInvalidHangCount| is used to signify there were no threads
  // of the type found.
  constexpr size_t kHangCountArraySize =
      static_cast<std::size_t>(base::HangWatcher::ThreadType::kMax) + 1;
  std::array<int, kHangCountArraySize> hung_counts_per_thread_type;

  constexpr int kInvalidHangCount = -1;
  hung_counts_per_thread_type.fill(kInvalidHangCount);

  // Copy hung thread information.
  for (const auto& watch_state : watch_states) {
    uint64_t flags;
    base::TimeTicks deadline;
    std::tie(flags, deadline) = watch_state->GetFlagsAndDeadline();

    if (deadline <= deadline_ignore_threshold) {
      found_deadline_before_ignore_threshold = true;
    }

    // Threads whose current scope is marked as ignored do not participate.
    if (internal::HangWatchDeadline::IsFlagSet(
            internal::HangWatchDeadline::Flag::
                kIgnoreCurrentHangWatchScopeEnabled,
            flags)) {
      continue;
    }

    // If a thread type is monitored and did not hang it still needs to be
    // logged as a zero count.
    const size_t hang_count_index =
        static_cast<size_t>(watch_state.get()->thread_type());
    if (hung_counts_per_thread_type[hang_count_index] == kInvalidHangCount) {
      hung_counts_per_thread_type[hang_count_index] = 0;
    }

    // Only copy hung threads.
    if (deadline <= now) {
      ++hung_counts_per_thread_type[hang_count_index];

      // Attempt to mark the thread as needing to stay within its current
      // HangWatchScopeEnabled until capture is complete.
      bool thread_marked = watch_state->SetShouldBlockOnHang(flags, deadline);

      // If marking some threads already failed the snapshot won't be kept so
      // there is no need to keep adding to it. The loop doesn't abort though
      // to keep marking the other threads. If these threads remain hung until
      // the next capture then they'll already be marked and will be included
      // in the capture at that time.
      if (thread_marked && all_threads_marked) {
        hung_watch_state_copies_.push_back(
            WatchStateCopy{deadline, watch_state.get()->GetThreadID()});
      } else {
        all_threads_marked = false;
      }
    }
  }

  // Log the hung thread counts to histograms for each thread type if any thread
  // of the type were found.
  for (size_t i = 0; i < kHangCountArraySize; ++i) {
    const int hang_count = hung_counts_per_thread_type[i];
    if (hang_count != kInvalidHangCount)
      LogHungThreadCountHistogram(static_cast<HangWatcher::ThreadType>(i),
                                  hang_count);
  }

  // Two cases can invalidate this snapshot and prevent the capture of the hang.
  //
  // 1. Some threads could not be marked for blocking so this snapshot isn't
  // actionable since marked threads could be hung because of unmarked ones.
  // If only the marked threads were captured the information would be
  // incomplete.
  //
  // 2. Any of the threads have a deadline before |deadline_ignore_threshold|.
  // If any thread is ignored it reduces the confidence in the whole state and
  // it's better to avoid capturing misleading data.
  if (!all_threads_marked || found_deadline_before_ignore_threshold) {
    hung_watch_state_copies_.clear();
    return;
  }

  // Sort |hung_watch_state_copies_| by order of decreasing hang severity so the
  // most severe hang (earliest expired deadline) is first in the list.
  ranges::sort(hung_watch_state_copies_,
               [](const WatchStateCopy& lhs, const WatchStateCopy& rhs) {
                 return lhs.deadline < rhs.deadline;
               });
}
546 
// Defaulted copy constructor and destructor.
HangWatcher::WatchStateSnapShot::WatchStateSnapShot(
    const WatchStateSnapShot& other) = default;

HangWatcher::WatchStateSnapShot::~WatchStateSnapShot() = default;
551 
PrepareHungThreadListCrashKey() const552 std::string HangWatcher::WatchStateSnapShot::PrepareHungThreadListCrashKey()
553     const {
554   DCHECK(IsActionable());
555 
556   // Build a crash key string that contains the ids of the hung threads.
557   constexpr char kSeparator{'|'};
558   std::string list_of_hung_thread_ids;
559 
560   // Add as many thread ids to the crash key as possible.
561   for (const WatchStateCopy& copy : hung_watch_state_copies_) {
562     std::string fragment = base::NumberToString(copy.thread_id) + kSeparator;
563     if (list_of_hung_thread_ids.size() + fragment.size() <
564         static_cast<std::size_t>(debug::CrashKeySize::Size256)) {
565       list_of_hung_thread_ids += fragment;
566     } else {
567       // Respect the by priority ordering of thread ids in the crash key by
568       // stopping the construction as soon as one does not fit. This avoids
569       // including lesser priority ids while omitting more important ones.
570       break;
571     }
572   }
573 
574   return list_of_hung_thread_ids;
575 }
576 
// A snapshot is actionable when at least one hung thread was recorded and the
// snapshot was not invalidated (see the constructor's invalidation cases).
bool HangWatcher::WatchStateSnapShot::IsActionable() const {
  return !hung_watch_state_copies_.empty();
}
580 
GrabWatchStateSnapshotForTesting() const581 HangWatcher::WatchStateSnapShot HangWatcher::GrabWatchStateSnapshotForTesting()
582     const {
583   WatchStateSnapShot snapshot(watch_states_, deadline_ignore_threshold_);
584   return snapshot;
585 }
586 
// Inspects all registered threads for expired deadlines and records a hang
// when the resulting snapshot is actionable.
void HangWatcher::Monitor() {
  DCHECK_CALLED_ON_VALID_THREAD(hang_watcher_thread_checker_);
  AutoLock auto_lock(watch_state_lock_);

  // If all threads unregistered since this function was invoked there's
  // nothing to do anymore.
  if (watch_states_.empty())
    return;

  WatchStateSnapShot watch_state_snapshot(watch_states_,
                                          deadline_ignore_threshold_);

  if (watch_state_snapshot.IsActionable()) {
    DoDumpWithoutCrashing(watch_state_snapshot);
  }
}
603 
DoDumpWithoutCrashing(const WatchStateSnapShot & watch_state_snapshot)604 void HangWatcher::DoDumpWithoutCrashing(
605     const WatchStateSnapShot& watch_state_snapshot) {
606   capture_in_progress_.store(true, std::memory_order_relaxed);
607   base::AutoLock scope_lock(capture_lock_);
608 
609 #if not defined(OS_NACL)
610   const std::string list_of_hung_thread_ids =
611       watch_state_snapshot.PrepareHungThreadListCrashKey();
612 
613   static debug::CrashKeyString* crash_key = AllocateCrashKeyString(
614       "list-of-hung-threads", debug::CrashKeySize::Size256);
615 
616   const debug::ScopedCrashKeyString list_of_hung_threads_crash_key_string(
617       crash_key, list_of_hung_thread_ids);
618 
619   const debug::ScopedCrashKeyString
620       time_since_last_critical_memory_pressure_crash_key_string =
621           GetTimeSinceLastCriticalMemoryPressureCrashKey();
622 #endif
623 
624   // To avoid capturing more than one hang that blames a subset of the same
625   // threads it's necessary to keep track of what is the furthest deadline
626   // that contributed to declaring a hang. Only once
627   // all threads have deadlines past this point can we be sure that a newly
628   // discovered hang is not directly related.
629   // Example:
630   // **********************************************************************
631   // Timeline A : L------1-------2----------3-------4----------N-----------
632   // Timeline B : -------2----------3-------4----------L----5------N-------
633   // Timeline C : L----------------------------5------6----7---8------9---N
634   // **********************************************************************
635   // In the example when a Monitor() happens during timeline A
636   // |deadline_ignore_threshold_| (L) is at time zero and deadlines (1-4)
637   // are before Now() (N) . A hang is captured and L is updated. During
638   // the next Monitor() (timeline B) a new deadline is over but we can't
639   // capture a hang because deadlines 2-4 are still live and already counted
640   // toward a hang. During a third monitor (timeline C) all live deadlines
641   // are now after L and a second hang can be recorded.
642   base::TimeTicks latest_expired_deadline =
643       watch_state_snapshot.GetHighestDeadline();
644 
645   if (on_hang_closure_for_testing_)
646     on_hang_closure_for_testing_.Run();
647   else
648     RecordHang();
649 
650   // Update after running the actual capture.
651   deadline_ignore_threshold_ = latest_expired_deadline;
652 
653   capture_in_progress_.store(false, std::memory_order_relaxed);
654 }
655 
// Test-only: closure run on the monitoring thread after every Monitor() call.
void HangWatcher::SetAfterMonitorClosureForTesting(
    base::RepeatingClosure closure) {
  DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
  after_monitor_closure_for_testing_ = std::move(closure);
}
661 
// Test-only: closure run instead of RecordHang() when a hang is captured.
void HangWatcher::SetOnHangClosureForTesting(base::RepeatingClosure closure) {
  DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
  on_hang_closure_for_testing_ = std::move(closure);
}
666 
// Test-only: overrides the interval between Monitor() calls.
void HangWatcher::SetMonitoringPeriodForTesting(base::TimeDelta period) {
  DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
  monitor_period_ = period;
}
671 
SetAfterWaitCallbackForTesting(RepeatingCallback<void (TimeTicks)> callback)672 void HangWatcher::SetAfterWaitCallbackForTesting(
673     RepeatingCallback<void(TimeTicks)> callback) {
674   DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
675   after_wait_callback_ = callback;
676 }
677 
// Test-only: wakes the monitoring thread immediately instead of waiting for
// the current monitoring period to elapse.
void HangWatcher::SignalMonitorEventForTesting() {
  DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
  should_monitor_.Signal();
}
682 
// Test-only: asks Run() to exit after its current iteration without signaling
// or joining the thread (contrast with Stop()).
void HangWatcher::StopMonitoringForTesting() {
  keep_monitoring_.store(false, std::memory_order_relaxed);
}
686 
// Test-only: substitutes the clock used by Wait() to measure sleep durations.
void HangWatcher::SetTickClockForTesting(const base::TickClock* tick_clock) {
  tick_clock_ = tick_clock;
}
690 
void HangWatcher::BlockIfCaptureInProgress() {
  // Makes a best-effort attempt to block execution if a hang is currently
  // being captured. Only block on |capture_lock| if |capture_in_progress_|
  // hints that it's already held, to avoid serializing all threads on this
  // function when no hang capture is in-progress. The AutoLock is released
  // immediately: acquiring it merely waits out the capture.
  if (capture_in_progress_.load(std::memory_order_relaxed))
    base::AutoLock hang_lock(capture_lock_);
}
699 
UnregisterThread()700 void HangWatcher::UnregisterThread() {
701   AutoLock auto_lock(watch_state_lock_);
702 
703   internal::HangWatchState* current_hang_watch_state =
704       internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();
705 
706   auto it = ranges::find_if(
707       watch_states_,
708       [current_hang_watch_state](
709           const std::unique_ptr<internal::HangWatchState>& state) {
710         return state.get() == current_hang_watch_state;
711       });
712 
713   // Thread should be registered to get unregistered.
714   DCHECK(it != watch_states_.end());
715 
716   watch_states_.erase(it);
717 }
718 
719 namespace internal {
720 namespace {
721 
// Bit layout of HangWatchDeadline's packed word: the low 56 bits hold the
// deadline value and the high 8 bits hold flags.
constexpr uint64_t kOnlyDeadlineMask = 0x00FFFFFFFFFFFFFFu;
constexpr uint64_t kOnlyFlagsMask = ~kOnlyDeadlineMask;
// Highest single bit a flag may occupy (see static_asserts below).
constexpr uint64_t kMaximumFlag = 0x8000000000000000u;

// Use as a mask to keep persistent flags and the deadline.
constexpr uint64_t kPersistentFlagsAndDeadlineMask =
    kOnlyDeadlineMask |
    static_cast<uint64_t>(
        HangWatchDeadline::Flag::kIgnoreCurrentHangWatchScopeEnabled) |
    static_cast<uint64_t>(
        HangWatchDeadline::Flag::kHasActiveHangWatchScopeDisabled);
733 }  // namespace
734 
// Flag binary representation assertions: every flag must live strictly in the
// high (non-deadline) bits and fit within the 64-bit word.
static_assert(
    static_cast<uint64_t>(HangWatchDeadline::Flag::kMinValue) >
        kOnlyDeadlineMask,
    "Invalid numerical value for flag. Would interfere with bits of data.");
static_assert(static_cast<uint64_t>(HangWatchDeadline::Flag::kMaxValue) <=
                  kMaximumFlag,
              "A flag can only set a single bit.");
743 
// Defaulted special members; no extra setup or teardown is needed.
HangWatchDeadline::HangWatchDeadline() = default;
HangWatchDeadline::~HangWatchDeadline() = default;
746 
GetFlagsAndDeadline() const747 std::pair<uint64_t, TimeTicks> HangWatchDeadline::GetFlagsAndDeadline() const {
748   uint64_t bits = bits_.load(std::memory_order_relaxed);
749   return std::make_pair(ExtractFlags(bits),
750                         DeadlineFromBits(ExtractDeadline((bits))));
751 }
752 
GetDeadline() const753 TimeTicks HangWatchDeadline::GetDeadline() const {
754   return DeadlineFromBits(
755       ExtractDeadline(bits_.load(std::memory_order_relaxed)));
756 }
757 
758 // static
Max()759 TimeTicks HangWatchDeadline::Max() {
760   // |kOnlyDeadlineMask| has all the bits reserved for the TimeTicks value
761   // set. This means it also represents the highest representable value.
762   return DeadlineFromBits(kOnlyDeadlineMask);
763 }
764 
765 // static
IsFlagSet(Flag flag,uint64_t flags)766 bool HangWatchDeadline::IsFlagSet(Flag flag, uint64_t flags) {
767   return static_cast<uint64_t>(flag) & flags;
768 }
769 
SetDeadline(TimeTicks new_deadline)770 void HangWatchDeadline::SetDeadline(TimeTicks new_deadline) {
771   DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
772   DCHECK(new_deadline <= Max()) << "Value too high to be represented.";
773   DCHECK(new_deadline >= TimeTicks{}) << "Value cannot be negative.";
774 
775   if (switch_bits_callback_for_testing_) {
776     const uint64_t switched_in_bits = SwitchBitsForTesting();
777     // If a concurrent deadline change is tested it cannot have a deadline or
778     // persistent flag change since those always happen on the same thread.
779     DCHECK((switched_in_bits & kPersistentFlagsAndDeadlineMask) == 0u);
780   }
781 
782   // Discard all non-persistent flags and apply deadline change.
783   const uint64_t old_bits = bits_.load(std::memory_order_relaxed);
784   const uint64_t new_flags =
785       ExtractFlags(old_bits & kPersistentFlagsAndDeadlineMask);
786   bits_.store(new_flags | ExtractDeadline(new_deadline.ToInternalValue()),
787               std::memory_order_relaxed);
788 }
789 
790 // TODO(crbug.com/1087026): Add flag DCHECKs here.
SetShouldBlockOnHang(uint64_t old_flags,TimeTicks old_deadline)791 bool HangWatchDeadline::SetShouldBlockOnHang(uint64_t old_flags,
792                                              TimeTicks old_deadline) {
793   DCHECK(old_deadline <= Max()) << "Value too high to be represented.";
794   DCHECK(old_deadline >= TimeTicks{}) << "Value cannot be negative.";
795 
796   // Set the kShouldBlockOnHang flag only if |bits_| did not change since it was
797   // read. kShouldBlockOnHang is the only non-persistent flag and should never
798   // be set twice. Persistent flags and deadline changes are done from the same
799   // thread so there is no risk of losing concurrently added information.
800   uint64_t old_bits =
801       old_flags | static_cast<uint64_t>(old_deadline.ToInternalValue());
802   const uint64_t desired_bits =
803       old_bits | static_cast<uint64_t>(Flag::kShouldBlockOnHang);
804 
805   // If a test needs to simulate |bits_| changing since calling this function
806   // this happens now.
807   if (switch_bits_callback_for_testing_) {
808     const uint64_t switched_in_bits = SwitchBitsForTesting();
809 
810     // Injecting the flag being tested is invalid.
811     DCHECK(!IsFlagSet(Flag::kShouldBlockOnHang, switched_in_bits));
812   }
813 
814   return bits_.compare_exchange_weak(old_bits, desired_bits,
815                                      std::memory_order_relaxed,
816                                      std::memory_order_relaxed);
817 }
818 
// Marks that a HangWatchScopeDisabled is live on this thread. Persistent:
// included in |kPersistentFlagsAndDeadlineMask| so it survives deadline
// changes until explicitly unset.
void HangWatchDeadline::SetHasActiveHangWatchScopeDisabled() {
  SetPersistentFlag(Flag::kHasActiveHangWatchScopeDisabled);
}

void HangWatchDeadline::UnsetHasActiveHangWatchScopeDisabled() {
  ClearPersistentFlag(Flag::kHasActiveHangWatchScopeDisabled);
}

// Marks that the current HangWatchScopeEnabled should be ignored. Persistent:
// included in |kPersistentFlagsAndDeadlineMask| so it survives deadline
// changes until explicitly unset.
void HangWatchDeadline::SetIgnoreCurrentHangWatchScopeEnabled() {
  SetPersistentFlag(Flag::kIgnoreCurrentHangWatchScopeEnabled);
}

void HangWatchDeadline::UnsetIgnoreCurrentHangWatchScopeEnabled() {
  ClearPersistentFlag(Flag::kIgnoreCurrentHangWatchScopeEnabled);
}
834 
// Atomically sets |flag| without touching the deadline or other flags. Must
// be called from the thread |thread_checker_| is bound to.
void HangWatchDeadline::SetPersistentFlag(Flag flag) {
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
  // Give tests a chance to swap in different |bits_| first, simulating a
  // concurrent modification.
  if (switch_bits_callback_for_testing_)
    SwitchBitsForTesting();
  bits_.fetch_or(static_cast<uint64_t>(flag), std::memory_order_relaxed);
}

// Atomically clears |flag| without touching the deadline or other flags. Must
// be called from the thread |thread_checker_| is bound to.
void HangWatchDeadline::ClearPersistentFlag(Flag flag) {
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
  // Give tests a chance to swap in different |bits_| first, simulating a
  // concurrent modification.
  if (switch_bits_callback_for_testing_)
    SwitchBitsForTesting();
  bits_.fetch_and(~(static_cast<uint64_t>(flag)), std::memory_order_relaxed);
}
848 
// static
// Returns only the flag bits (top byte) of |bits|.
uint64_t HangWatchDeadline::ExtractFlags(uint64_t bits) {
  return bits & kOnlyFlagsMask;
}

// static
// Returns only the deadline bits (lower 7 bytes) of |bits|.
uint64_t HangWatchDeadline::ExtractDeadline(uint64_t bits) {
  return bits & kOnlyDeadlineMask;
}

// static
// Converts raw deadline bits to a TimeTicks. |bits| must not carry flag bits.
TimeTicks HangWatchDeadline::DeadlineFromBits(uint64_t bits) {
  // |kOnlyDeadlineMask| has all the deadline bits set to 1 so is the largest
  // representable value.
  DCHECK(bits <= kOnlyDeadlineMask)
      << "Flags bits are set. Remove them before returning deadline.";
  return TimeTicks::FromInternalValue(bits);
}

// Returns whether |flag| is currently set in |bits_|.
bool HangWatchDeadline::IsFlagSet(Flag flag) const {
  return bits_.load(std::memory_order_relaxed) & static_cast<uint64_t>(flag);
}
871 
SetSwitchBitsClosureForTesting(RepeatingCallback<uint64_t (void)> closure)872 void HangWatchDeadline::SetSwitchBitsClosureForTesting(
873     RepeatingCallback<uint64_t(void)> closure) {
874   switch_bits_callback_for_testing_ = closure;
875 }
876 
ResetSwitchBitsClosureForTesting()877 void HangWatchDeadline::ResetSwitchBitsClosureForTesting() {
878   DCHECK(switch_bits_callback_for_testing_);
879   switch_bits_callback_for_testing_.Reset();
880 }
881 
SwitchBitsForTesting()882 uint64_t HangWatchDeadline::SwitchBitsForTesting() {
883   DCHECK(switch_bits_callback_for_testing_);
884 
885   const uint64_t old_bits = bits_.load(std::memory_order_relaxed);
886   const uint64_t new_bits = switch_bits_callback_for_testing_.Run();
887   const uint64_t old_flags = ExtractFlags(old_bits);
888 
889   const uint64_t switched_in_bits = old_flags | new_bits;
890   bits_.store(switched_in_bits, std::memory_order_relaxed);
891   return switched_in_bits;
892 }
893 
// Captures the current thread's id and binds this instance to the thread
// through the thread-local pointer.
HangWatchState::HangWatchState(HangWatcher::ThreadType thread_type)
    : thread_id_(PlatformThread::CurrentId()), thread_type_(thread_type) {
  // There should not exist a state object for this thread already.
  DCHECK(!GetHangWatchStateForCurrentThread()->Get());

  // Bind the new instance to this thread.
  GetHangWatchStateForCurrentThread()->Set(this);
}

// Must run on the same thread as the constructor: unbinds the thread-local
// pointer set there.
HangWatchState::~HangWatchState() {
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);

  DCHECK_EQ(GetHangWatchStateForCurrentThread()->Get(), this);
  GetHangWatchStateForCurrentThread()->Set(nullptr);

#if DCHECK_IS_ON()
  // Destroying the HangWatchState should not be done if there are live
  // HangWatchScopeEnableds.
  DCHECK(!current_hang_watch_scope_enable_);
#endif
}
915 
916 // static
917 std::unique_ptr<HangWatchState>
CreateHangWatchStateForCurrentThread(HangWatcher::ThreadType thread_type)918 HangWatchState::CreateHangWatchStateForCurrentThread(
919     HangWatcher::ThreadType thread_type) {
920   // Allocate a watch state object for this thread.
921   std::unique_ptr<HangWatchState> hang_state =
922       std::make_unique<HangWatchState>(thread_type);
923 
924   // Setting the thread local worked.
925   DCHECK_EQ(GetHangWatchStateForCurrentThread()->Get(), hang_state.get());
926 
927   // Transfer ownership to caller.
928   return hang_state;
929 }
930 
// Returns the current deadline for this thread.
TimeTicks HangWatchState::GetDeadline() const {
  return deadline_.GetDeadline();
}

// Returns the flags and deadline taken from one coherent read of the
// underlying bits.
std::pair<uint64_t, TimeTicks> HangWatchState::GetFlagsAndDeadline() const {
  return deadline_.GetFlagsAndDeadline();
}

// Replaces the deadline. Must be called from the watched thread.
void HangWatchState::SetDeadline(TimeTicks deadline) {
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
  deadline_.SetDeadline(deadline);
}

// Returns true if the current time is strictly past the deadline.
bool HangWatchState::IsOverDeadline() const {
  return TimeTicks::Now() > deadline_.GetDeadline();
}
947 
// Thin forwarders to |deadline_|. See HangWatchDeadline for the semantics of
// each flag.
void HangWatchState::SetHasActiveHangWatchScopeDisabled() {
  deadline_.SetHasActiveHangWatchScopeDisabled();
}

void HangWatchState::UnsetHasActiveHangWatchScopeDisabled() {
  deadline_.UnsetHasActiveHangWatchScopeDisabled();
}

void HangWatchState::SetIgnoreCurrentHangWatchScopeEnabled() {
  deadline_.SetIgnoreCurrentHangWatchScopeEnabled();
}

void HangWatchState::UnsetIgnoreCurrentHangWatchScopeEnabled() {
  deadline_.UnsetIgnoreCurrentHangWatchScopeEnabled();
}

// Forwards to HangWatchDeadline::SetShouldBlockOnHang. Returns false when the
// underlying bits no longer match the |old_flags|/|old_deadline| snapshot.
bool HangWatchState::SetShouldBlockOnHang(uint64_t old_flags,
                                          TimeTicks old_deadline) {
  return deadline_.SetShouldBlockOnHang(old_flags, old_deadline);
}

bool HangWatchState::IsFlagSet(HangWatchDeadline::Flag flag) {
  return deadline_.IsFlagSet(flag);
}
972 
#if DCHECK_IS_ON()
// Debug-only bookkeeping: records the innermost live HangWatchScopeEnabled so
// the destructor can DCHECK that none outlives this state.
void HangWatchState::SetCurrentHangWatchScopeEnabled(
    HangWatchScopeEnabled* current_hang_watch_scope_enable) {
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
  current_hang_watch_scope_enable_ = current_hang_watch_scope_enable;
}

// Returns the innermost live HangWatchScopeEnabled, or null if none.
HangWatchScopeEnabled* HangWatchState::GetCurrentHangWatchScopeEnabled() {
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
  return current_hang_watch_scope_enable_;
}
#endif
985 
// Test-only accessor to the underlying HangWatchDeadline.
HangWatchDeadline* HangWatchState::GetHangWatchDeadlineForTesting() {
  return &deadline_;
}

// Tracks the nesting depth of HangWatchScopeEnableds on this thread.
void HangWatchState::IncrementNestingLevel() {
  ++nesting_level_;
}

void HangWatchState::DecrementNestingLevel() {
  --nesting_level_;
}

// static
// Returns the thread-local slot holding this thread's HangWatchState. The
// slot is intentionally leaked (NoDestructor) so it needs no shutdown
// ordering.
ThreadLocalPointer<HangWatchState>*
HangWatchState::GetHangWatchStateForCurrentThread() {
  static NoDestructor<ThreadLocalPointer<HangWatchState>> hang_watch_state;
  return hang_watch_state.get();
}

// Returns the id of the thread this state was constructed on.
PlatformThreadId HangWatchState::GetThreadID() const {
  return thread_id_;
}
1008 
1009 }  // namespace internal
1010 
1011 }  // namespace base
1012