1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef GPU_IPC_SERVICE_GPU_WATCHDOG_THREAD_H_
6 #define GPU_IPC_SERVICE_GPU_WATCHDOG_THREAD_H_
7 
8 #include "base/atomicops.h"
9 #include "base/macros.h"
10 #include "base/memory/ref_counted.h"
11 #include "base/memory/weak_ptr.h"
12 #include "base/metrics/histogram_macros.h"
13 #include "base/power_monitor/power_observer.h"
14 #include "base/task/task_observer.h"
15 #include "base/threading/thread.h"
16 #include "base/time/time.h"
17 #include "build/build_config.h"
18 #include "gpu/ipc/common/gpu_watchdog_timeout.h"
19 #include "gpu/ipc/service/gpu_ipc_service_export.h"
20 #include "ui/gfx/native_widget_types.h"
21 #include "ui/gl/progress_reporter.h"
22 
23 namespace gpu {
24 
// Lifecycle events of the GPU watchdog thread.
//
// These values are persisted to logs. Entries should not be renumbered and
// numeric values should never be reused. Every enumerator carries an explicit
// value so that inserting or reordering entries cannot silently change the
// numbers that have already been logged. New entries must take the next
// unused value, and kMaxValue must be updated to alias the last entry.
enum class GpuWatchdogThreadEvent {
  kGpuWatchdogStart = 0,
  kGpuWatchdogKill = 1,
  kGpuWatchdogEnd = 2,
  kMaxValue = kGpuWatchdogEnd,
};
33 
// Outcomes recorded while handling a watchdog timeout.
//
// These values are persisted to logs. Entries should not be renumbered and
// numeric values should never be reused. Every enumerator carries an explicit
// value so that inserting or reordering entries cannot silently change the
// numbers that have already been logged. New entries must take the next
// unused value, and kMaxValue must be updated to alias the last entry.
enum class GpuWatchdogTimeoutEvent {
  // Recorded each time OnWatchdogTimeout() is called.
  kTimeout = 0,
  // Recorded when a GPU main thread is killed for a detected hang.
  kKill = 1,
  // Windows only: Recorded when a hang is detected but we allow the GPU main
  // thread to continue until it has spent the full thread time doing the work.
  kMoreThreadTime = 2,
  // Windows only: The GPU makes progress after being given more thread time.
  // The GPU main thread is not killed.
  kProgressAfterMoreThreadTime = 3,
  // A gpu hang is detected but watchdog waits for 60 seconds before taking
  // action.
  kTimeoutWait = 4,
  // The GPU makes progress within 60 sec in OnWatchdogTimeout(). The GPU main
  // thread is not killed.
  kProgressAfterWait = 5,
  // Just continue if it's not on the TTY of our host X11 server.
  kContinueOnNonHostServerTty = 6,
  // Windows only: After detecting GPU hang and continuing running through
  // OnGpuWatchdogTimeout for the max cycles, the GPU main thread still cannot
  // get the full thread time.
  kLessThanFullThreadTimeAfterCapped = 7,
  // Windows only: The GPU main thread went through the
  // kLessThanFullThreadTimeAfterCapped stage before the process is killed.
  kKillOnLessThreadTime = 8,
  // OnWatchdogTimeout() is called long after the expected time. The GPU is not
  // killed this time because of the slow system.
  kSlowWatchdogThread = 9,
  kMaxValue = kSlowWatchdogThread,
};
68 
#if defined(OS_WIN)
// Windows only. When the watched GPU thread has spent less time doing actual
// work than the watchdog timeout, it is allowed to keep running through
// OnGPUWatchdogTimeout instead of being killed right away. This constant
// bounds how many times that may happen before the gpu thread is killed.
// NOTE(review): the previous wording said "at most 4 times" while the value
// is 3 -- confirm the exact bound against the comparison in the .cc file.
constexpr int kMaxCountOfMoreGpuThreadTimeAllowed{3};
#endif
// Extra watchdog timeout cycles to wait before killing after a hang is
// detected. NOTE(review): presumably the default handed to
// GpuWatchdogThread::Create() as |max_extra_cycles_before_kill| -- confirm
// against callers.
constexpr int kMaxExtraCyclesBeforeKill{0};
76 
77 // A thread that intermitently sends tasks to a group of watched message loops
78 // and deliberately crashes if one of them does not respond after a timeout.
79 class GPU_IPC_SERVICE_EXPORT GpuWatchdogThread : public base::Thread,
80                                                  public base::PowerObserver,
81                                                  public base::TaskObserver,
82                                                  public gl::ProgressReporter {
83  public:
84   static std::unique_ptr<GpuWatchdogThread> Create(bool start_backgrounded);
85 
86   static std::unique_ptr<GpuWatchdogThread> Create(
87       bool start_backgrounded,
88       base::TimeDelta timeout,
89       int init_factor,
90       int restart_factor,
91       int max_extra_cycles_before_kill,
92       bool test_mode);
93 
94   ~GpuWatchdogThread() override;
95 
96   // Must be called after a PowerMonitor has been created. Can be called from
97   // any thread.
98   void AddPowerObserver();
99 
100   // Notifies the watchdog when Chrome is backgrounded / foregrounded. Should
101   // only be used if Chrome is completely backgrounded and not expected to
102   // render (all windows backgrounded and not producing frames).
103   void OnBackgrounded();
104   void OnForegrounded();
105 
106   // The watchdog starts armed to catch startup hangs, and needs to be disarmed
107   // once init is complete, before executing tasks.
108   void OnInitComplete();
109 
110   // Notifies the watchdog when the GPU child process is being destroyed.
111   // This function is called directly from
112   // viz::GpuServiceImpl::~GpuServiceImpl()
113   void OnGpuProcessTearDown();
114 
115   // Pause the GPU watchdog to stop the timeout task. If the current heavy task
116   // is not running on the GPU driver, the watchdog can be paused to avoid
117   // unneeded crash.
118   void PauseWatchdog();
119   // Continue the watchdog after a pause.
120   void ResumeWatchdog();
121 
122   // For gpu testing only. Return status for the watchdog tests
123   bool IsGpuHangDetectedForTesting();
124 
125   void WaitForPowerObserverAddedForTesting();
126 
127   // Implements base::Thread.
128   void Init() override;
129   void CleanUp() override;
130 
131   // Implements gl::ProgressReporter.
132   void ReportProgress() override;
133 
134   // Implements TaskObserver.
135   void WillProcessTask(const base::PendingTask& pending_task,
136                        bool was_blocked_or_low_priority) override;
137   void DidProcessTask(const base::PendingTask& pending_task) override;
138 
139   // Implements base::PowerObserver.
140   void OnSuspend() override;
141   void OnResume() override;
142 
143  protected:
144   GpuWatchdogThread();
145 
146  private:
147   enum PauseResumeSource {
148     kAndroidBackgroundForeground = 0,
149     kPowerSuspendResume = 1,
150     kGeneralGpuFlow = 2,
151   };
152 
153   GpuWatchdogThread(base::TimeDelta timeout,
154                     int init_factor,
155                     int restart_factor,
156                     int max_extra_cycles_before_kill,
157                     bool test_mode);
158   void OnAddPowerObserver();
159   void RestartWatchdogTimeoutTask(PauseResumeSource source_of_request);
160   void StopWatchdogTimeoutTask(PauseResumeSource source_of_request);
161   void UpdateInitializationFlag();
162   void Arm();
163   void Disarm();
164   void InProgress();
165   bool IsArmed();
166   base::subtle::Atomic32 ReadArmDisarmCounter();
167   void OnWatchdogTimeout();
168   bool SlowWatchdogThread();
169   bool WatchedThreadNeedsMoreThreadTime(bool no_gpu_hang_detected);
170 #if defined(OS_WIN)
171   base::ThreadTicks GetWatchedThreadTime();
172 #endif
173   bool WatchedThreadGetsExtraTimeout(bool no_gpu_hang);
174 
175   // Do not change the function name. It is used for [GPU HANG] carsh reports.
176   void DeliberatelyTerminateToRecoverFromHang();
177 
178   // Records "GPU.WatchdogThread.Event".
179   void GpuWatchdogHistogram(GpuWatchdogThreadEvent thread_event);
180 
181   // Histogram recorded in OnWatchdogTimeout()
182   // Records "GPU.WatchdogThread.Timeout"
183   void GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent timeout_event);
184 
185 #if defined(OS_WIN)
186   // The extra thread time the GPU main thread needs to make a progress.
187   // Records "GPU.WatchdogThread.ExtraThreadTime".
188   void RecordExtraThreadTimeHistogram();
189   // The number of users per timeout stay in Chrome after giving extra thread
190   // time. Records "GPU.WatchdogThread.ExtraThreadTime.NumOfUsers" and
191   // "GPU.WatchdogThread.Timeout".
192   void RecordNumOfUsersWaitingWithExtraThreadTimeHistogram(int count);
193 
194   // Histograms recorded for WatchedThreadNeedsMoreThreadTime() function.
195   void WatchedThreadNeedsMoreThreadTimeHistogram(
196       bool no_gpu_hang_detected,
197       bool start_of_more_thread_time);
198 #endif
199 
200   // The number of users stay in Chrome after the extra timeout wait cycles.
201   // Records "GPU.WatchdogThread.WaitTime.ProgressAfterWait",
202   // "GPU.WatchdogThread.WaitTime.NumOfUsers" and "GPU.WatchdogThread.Timeout".
203   void WatchedThreadGetsExtraTimeoutHistogram(bool no_gpu_hang);
204 
205   // Used for metrics. It's 1 minute after the event.
206   bool WithinOneMinFromPowerResumed();
207   bool WithinOneMinFromForegrounded();
208 
209 #if defined(USE_X11) && !defined(OS_BSD)
210   void UpdateActiveTTY();
211 #endif
212   // The watchdog continues when it's not on the TTY of our host X11 server.
213   bool ContinueOnNonHostX11ServerTty();
214 
215   // This counter is only written on the gpu thread, and read on both threads.
216   volatile base::subtle::Atomic32 arm_disarm_counter_ = 0;
217   // The counter number read in the last OnWatchdogTimeout() on the watchdog
218   // thread.
219   int32_t last_arm_disarm_counter_ = 0;
220 
221   // Timeout on the watchdog thread to check if gpu hangs.
222   base::TimeDelta watchdog_timeout_;
223 
224   // The one-time watchdog timeout multiplier in the gpu initialization.
225   int watchdog_init_factor_;
226 
227   // The one-time watchdog timeout multiplier after the watchdog pauses and
228   // restarts.
229   int watchdog_restart_factor_;
230 
231   // The time the gpu watchdog was created.
232   base::TimeTicks watchdog_start_timeticks_;
233 
234   // The time the last OnSuspend and OnResume was called.
235   base::TimeTicks power_suspend_timeticks_;
236   base::TimeTicks power_resume_timeticks_;
237 
238   // The time the last OnBackgrounded and OnForegrounded was called.
239   base::TimeTicks backgrounded_timeticks_;
240   base::TimeTicks foregrounded_timeticks_;
241 
242   // The time PauseWatchdog and ResumeWatchdog was called.
243   base::TimeTicks watchdog_pause_timeticks_;
244   base::TimeTicks watchdog_resume_timeticks_;
245 
246   // TimeTicks: Tracking the amount of time a task runs. Executing delayed
247   //            tasks at the right time.
248   // ThreadTicks: Use this timer to (approximately) measure how much time the
249   // calling thread spent doing actual work vs. being de-scheduled.
250 
251   // The time the last OnWatchdogTimeout() was called.
252   base::TimeTicks last_on_watchdog_timeout_timeticks_;
253 
254   // The wall-clock time the next OnWatchdogTimeout() will be called.
255   base::Time next_on_watchdog_timeout_time_;
256 
257 #if defined(OS_WIN)
258   base::ThreadTicks last_on_watchdog_timeout_thread_ticks_;
259 
260   // The difference between the timeout and the actual time the watched thread
261   // spent doing actual work.
262   base::TimeDelta remaining_watched_thread_ticks_;
263 
264   // The Windows thread hanndle of the watched GPU main thread.
265   void* watched_thread_handle_ = nullptr;
266 
267   // After GPU hang detected, how many times has the GPU thread been allowed to
268   // continue due to not enough thread time.
269   int count_of_more_gpu_thread_time_allowed_ = 0;
270 
271   // The total timeout, up to 60 seconds, the watchdog thread waits for the GPU
272   // main thread to get full thread time.
273   base::TimeDelta time_in_wait_for_full_thread_time_;
274 
275   // After detecting GPU hang and continuing running through
276   // OnGpuWatchdogTimeout for the max cycles, the GPU main thread still cannot
277   // get the full thread time.
278   bool less_than_full_thread_time_after_capped_ = false;
279 #endif
280 
281 #if defined(USE_X11) && !defined(OS_BSD)
282   FILE* tty_file_ = nullptr;
283   int host_tty_ = -1;
284   int active_tty_ = -1;
285   int last_active_tty_ = -1;
286 #endif
287 
288   // The system has entered the power suspension mode.
289   bool in_power_suspension_ = false;
290 
291   // The GPU process has started tearing down. Accessed only in the gpu process.
292   bool in_gpu_process_teardown_ = false;
293 
294   // Chrome is running on the background on Android. Gpu is probably very slow
295   // or stalled.
296   bool is_backgrounded_ = false;
297 
298   // The GPU watchdog is paused. The timeout task is temporarily stopped.
299   bool is_paused_ = false;
300 
301   // Whether the watchdog thread has been called and added to the power monitor
302   // observer.
303   bool is_add_power_observer_called_ = false;
304   bool is_power_observer_added_ = false;
305 
306   // whether GpuWatchdogThreadEvent::kGpuWatchdogStart has been recorded.
307   bool is_watchdog_start_histogram_recorded = false;
308 
309   // Read/Write by the watchdog thread only after initialized in the
310   // constructor.
311   bool in_gpu_initialization_ = false;
312 
313   // The number of logical processors/cores on the current machine.
314   int num_of_processors_ = 0;
315 
316   // Don't kill the GPU process immediately after a gpu hang is detected. Wait
317   // for extra cycles of timeout. Kill it, if the GPU still doesn't respond
318   // after wait.
319   const int max_extra_cycles_before_kill_;
320   // how many cycles of timeout since we detect a hang.
321   int count_of_extra_cycles_ = 0;
322 
323   // For the experiment and the debugging purpose
324   size_t num_of_timeout_after_power_resume_ = 0;
325   size_t num_of_timeout_after_foregrounded_ = 0;
326   bool foregrounded_event_ = false;
327   bool power_resumed_event_ = false;
328 
329   // For gpu testing only.
330   const bool is_test_mode_;
331   // Set by the watchdog thread and Read by the test thread.
332   base::AtomicFlag test_result_timeout_and_gpu_hang_;
333 
334   scoped_refptr<base::SingleThreadTaskRunner> watched_gpu_task_runner_;
335   scoped_refptr<base::SingleThreadTaskRunner> watchdog_thread_task_runner_;
336 
337   base::WeakPtr<GpuWatchdogThread> weak_ptr_;
338   base::WeakPtrFactory<GpuWatchdogThread> weak_factory_{this};
339 
340   DISALLOW_COPY_AND_ASSIGN(GpuWatchdogThread);
341 };
342 
343 }  // namespace gpu
344 #endif  // GPU_IPC_SERVICE_GPU_WATCHDOG_THREAD_H_
345