1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef GPU_IPC_SERVICE_GPU_WATCHDOG_THREAD_H_ 6 #define GPU_IPC_SERVICE_GPU_WATCHDOG_THREAD_H_ 7 8 #include "base/atomicops.h" 9 #include "base/macros.h" 10 #include "base/memory/ref_counted.h" 11 #include "base/memory/weak_ptr.h" 12 #include "base/metrics/histogram_macros.h" 13 #include "base/power_monitor/power_observer.h" 14 #include "base/task/task_observer.h" 15 #include "base/threading/thread.h" 16 #include "base/time/time.h" 17 #include "build/build_config.h" 18 #include "gpu/ipc/common/gpu_watchdog_timeout.h" 19 #include "gpu/ipc/service/gpu_ipc_service_export.h" 20 #include "ui/gfx/native_widget_types.h" 21 #include "ui/gl/progress_reporter.h" 22 23 namespace gpu { 24 25 // These values are persisted to logs. Entries should not be renumbered and 26 // numeric values should never be reused. 27 enum class GpuWatchdogThreadEvent { 28 kGpuWatchdogStart, 29 kGpuWatchdogKill, 30 kGpuWatchdogEnd, 31 kMaxValue = kGpuWatchdogEnd, 32 }; 33 34 // These values are persisted to logs. Entries should not be renumbered and 35 // numeric values should never be reused. 36 enum class GpuWatchdogTimeoutEvent { 37 // Recorded each time OnWatchdogTimeout() is called. 38 kTimeout, 39 // Recorded when a GPU main thread is killed for a detected hang. 40 kKill, 41 // Window only: Recorded when a hang is detected but we allow the GPU main 42 // thread to continue until it spent the full 43 // thread time doing the work. 44 kMoreThreadTime, 45 // Windows only: The GPU makes progress after givenmore thread time. The GPU 46 // main thread is not killed. 47 kProgressAfterMoreThreadTime, 48 // A gpu hang is detected but watchdog waits for 60 seconds before taking 49 // action. 50 kTimeoutWait, 51 // The GPU makes progress within 60 sec in OnWatchdogTimeout(). The GPU main 52 // thread is not killed. 53 kProgressAfterWait, 54 // Just continue if it's not on the TTY of our host X11 server. 55 kContinueOnNonHostServerTty, 56 // Windows only: After detecting GPU hang and continuing running through 57 // OnGpuWatchdogTimeout for the max cycles, the GPU main thread still cannot 58 // get the full thread time. 59 kLessThanFullThreadTimeAfterCapped, 60 // Windows only: The GPU main thread went through the 61 // kLessThanFullThreadTimeAfterCapped stage before the process is killed. 62 kKillOnLessThreadTime, 63 // OnWatchdogTimeout() is called long after the expected time. The GPU is not 64 // killed this time because of the slow system. 65 kSlowWatchdogThread, 66 kMaxValue = kSlowWatchdogThread, 67 }; 68 69 #if defined(OS_WIN) 70 // If the actual time the watched GPU thread spent doing actual work is less 71 // than the watchdog timeout, the GPU thread can continue running through 72 // OnGPUWatchdogTimeout for at most 4 times before the gpu thread is killed. 73 constexpr int kMaxCountOfMoreGpuThreadTimeAllowed = 3; 74 #endif 75 constexpr int kMaxExtraCyclesBeforeKill = 0; 76 77 // A thread that intermitently sends tasks to a group of watched message loops 78 // and deliberately crashes if one of them does not respond after a timeout. 79 class GPU_IPC_SERVICE_EXPORT GpuWatchdogThread : public base::Thread, 80 public base::PowerObserver, 81 public base::TaskObserver, 82 public gl::ProgressReporter { 83 public: 84 static std::unique_ptr<GpuWatchdogThread> Create(bool start_backgrounded); 85 86 static std::unique_ptr<GpuWatchdogThread> Create( 87 bool start_backgrounded, 88 base::TimeDelta timeout, 89 int init_factor, 90 int restart_factor, 91 int max_extra_cycles_before_kill, 92 bool test_mode); 93 94 ~GpuWatchdogThread() override; 95 96 // Must be called after a PowerMonitor has been created. Can be called from 97 // any thread. 98 void AddPowerObserver(); 99 100 // Notifies the watchdog when Chrome is backgrounded / foregrounded. Should 101 // only be used if Chrome is completely backgrounded and not expected to 102 // render (all windows backgrounded and not producing frames). 103 void OnBackgrounded(); 104 void OnForegrounded(); 105 106 // The watchdog starts armed to catch startup hangs, and needs to be disarmed 107 // once init is complete, before executing tasks. 108 void OnInitComplete(); 109 110 // Notifies the watchdog when the GPU child process is being destroyed. 111 // This function is called directly from 112 // viz::GpuServiceImpl::~GpuServiceImpl() 113 void OnGpuProcessTearDown(); 114 115 // Pause the GPU watchdog to stop the timeout task. If the current heavy task 116 // is not running on the GPU driver, the watchdog can be paused to avoid 117 // unneeded crash. 118 void PauseWatchdog(); 119 // Continue the watchdog after a pause. 120 void ResumeWatchdog(); 121 122 // For gpu testing only. Return status for the watchdog tests 123 bool IsGpuHangDetectedForTesting(); 124 125 void WaitForPowerObserverAddedForTesting(); 126 127 // Implements base::Thread. 128 void Init() override; 129 void CleanUp() override; 130 131 // Implements gl::ProgressReporter. 132 void ReportProgress() override; 133 134 // Implements TaskObserver. 135 void WillProcessTask(const base::PendingTask& pending_task, 136 bool was_blocked_or_low_priority) override; 137 void DidProcessTask(const base::PendingTask& pending_task) override; 138 139 // Implements base::PowerObserver. 140 void OnSuspend() override; 141 void OnResume() override; 142 143 protected: 144 GpuWatchdogThread(); 145 146 private: 147 enum PauseResumeSource { 148 kAndroidBackgroundForeground = 0, 149 kPowerSuspendResume = 1, 150 kGeneralGpuFlow = 2, 151 }; 152 153 GpuWatchdogThread(base::TimeDelta timeout, 154 int init_factor, 155 int restart_factor, 156 int max_extra_cycles_before_kill, 157 bool test_mode); 158 void OnAddPowerObserver(); 159 void RestartWatchdogTimeoutTask(PauseResumeSource source_of_request); 160 void StopWatchdogTimeoutTask(PauseResumeSource source_of_request); 161 void UpdateInitializationFlag(); 162 void Arm(); 163 void Disarm(); 164 void InProgress(); 165 bool IsArmed(); 166 base::subtle::Atomic32 ReadArmDisarmCounter(); 167 void OnWatchdogTimeout(); 168 bool SlowWatchdogThread(); 169 bool WatchedThreadNeedsMoreThreadTime(bool no_gpu_hang_detected); 170 #if defined(OS_WIN) 171 base::ThreadTicks GetWatchedThreadTime(); 172 #endif 173 bool WatchedThreadGetsExtraTimeout(bool no_gpu_hang); 174 175 // Do not change the function name. It is used for [GPU HANG] carsh reports. 176 void DeliberatelyTerminateToRecoverFromHang(); 177 178 // Records "GPU.WatchdogThread.Event". 179 void GpuWatchdogHistogram(GpuWatchdogThreadEvent thread_event); 180 181 // Histogram recorded in OnWatchdogTimeout() 182 // Records "GPU.WatchdogThread.Timeout" 183 void GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent timeout_event); 184 185 #if defined(OS_WIN) 186 // The extra thread time the GPU main thread needs to make a progress. 187 // Records "GPU.WatchdogThread.ExtraThreadTime". 188 void RecordExtraThreadTimeHistogram(); 189 // The number of users per timeout stay in Chrome after giving extra thread 190 // time. Records "GPU.WatchdogThread.ExtraThreadTime.NumOfUsers" and 191 // "GPU.WatchdogThread.Timeout". 192 void RecordNumOfUsersWaitingWithExtraThreadTimeHistogram(int count); 193 194 // Histograms recorded for WatchedThreadNeedsMoreThreadTime() function. 195 void WatchedThreadNeedsMoreThreadTimeHistogram( 196 bool no_gpu_hang_detected, 197 bool start_of_more_thread_time); 198 #endif 199 200 // The number of users stay in Chrome after the extra timeout wait cycles. 201 // Records "GPU.WatchdogThread.WaitTime.ProgressAfterWait", 202 // "GPU.WatchdogThread.WaitTime.NumOfUsers" and "GPU.WatchdogThread.Timeout". 203 void WatchedThreadGetsExtraTimeoutHistogram(bool no_gpu_hang); 204 205 // Used for metrics. It's 1 minute after the event. 206 bool WithinOneMinFromPowerResumed(); 207 bool WithinOneMinFromForegrounded(); 208 209 #if defined(USE_X11) && !defined(OS_BSD) 210 void UpdateActiveTTY(); 211 #endif 212 // The watchdog continues when it's not on the TTY of our host X11 server. 213 bool ContinueOnNonHostX11ServerTty(); 214 215 // This counter is only written on the gpu thread, and read on both threads. 216 volatile base::subtle::Atomic32 arm_disarm_counter_ = 0; 217 // The counter number read in the last OnWatchdogTimeout() on the watchdog 218 // thread. 219 int32_t last_arm_disarm_counter_ = 0; 220 221 // Timeout on the watchdog thread to check if gpu hangs. 222 base::TimeDelta watchdog_timeout_; 223 224 // The one-time watchdog timeout multiplier in the gpu initialization. 225 int watchdog_init_factor_; 226 227 // The one-time watchdog timeout multiplier after the watchdog pauses and 228 // restarts. 229 int watchdog_restart_factor_; 230 231 // The time the gpu watchdog was created. 232 base::TimeTicks watchdog_start_timeticks_; 233 234 // The time the last OnSuspend and OnResume was called. 235 base::TimeTicks power_suspend_timeticks_; 236 base::TimeTicks power_resume_timeticks_; 237 238 // The time the last OnBackgrounded and OnForegrounded was called. 239 base::TimeTicks backgrounded_timeticks_; 240 base::TimeTicks foregrounded_timeticks_; 241 242 // The time PauseWatchdog and ResumeWatchdog was called. 243 base::TimeTicks watchdog_pause_timeticks_; 244 base::TimeTicks watchdog_resume_timeticks_; 245 246 // TimeTicks: Tracking the amount of time a task runs. Executing delayed 247 // tasks at the right time. 248 // ThreadTicks: Use this timer to (approximately) measure how much time the 249 // calling thread spent doing actual work vs. being de-scheduled. 250 251 // The time the last OnWatchdogTimeout() was called. 252 base::TimeTicks last_on_watchdog_timeout_timeticks_; 253 254 // The wall-clock time the next OnWatchdogTimeout() will be called. 255 base::Time next_on_watchdog_timeout_time_; 256 257 #if defined(OS_WIN) 258 base::ThreadTicks last_on_watchdog_timeout_thread_ticks_; 259 260 // The difference between the timeout and the actual time the watched thread 261 // spent doing actual work. 262 base::TimeDelta remaining_watched_thread_ticks_; 263 264 // The Windows thread hanndle of the watched GPU main thread. 265 void* watched_thread_handle_ = nullptr; 266 267 // After GPU hang detected, how many times has the GPU thread been allowed to 268 // continue due to not enough thread time. 269 int count_of_more_gpu_thread_time_allowed_ = 0; 270 271 // The total timeout, up to 60 seconds, the watchdog thread waits for the GPU 272 // main thread to get full thread time. 273 base::TimeDelta time_in_wait_for_full_thread_time_; 274 275 // After detecting GPU hang and continuing running through 276 // OnGpuWatchdogTimeout for the max cycles, the GPU main thread still cannot 277 // get the full thread time. 278 bool less_than_full_thread_time_after_capped_ = false; 279 #endif 280 281 #if defined(USE_X11) && !defined(OS_BSD) 282 FILE* tty_file_ = nullptr; 283 int host_tty_ = -1; 284 int active_tty_ = -1; 285 int last_active_tty_ = -1; 286 #endif 287 288 // The system has entered the power suspension mode. 289 bool in_power_suspension_ = false; 290 291 // The GPU process has started tearing down. Accessed only in the gpu process. 292 bool in_gpu_process_teardown_ = false; 293 294 // Chrome is running on the background on Android. Gpu is probably very slow 295 // or stalled. 296 bool is_backgrounded_ = false; 297 298 // The GPU watchdog is paused. The timeout task is temporarily stopped. 299 bool is_paused_ = false; 300 301 // Whether the watchdog thread has been called and added to the power monitor 302 // observer. 303 bool is_add_power_observer_called_ = false; 304 bool is_power_observer_added_ = false; 305 306 // whether GpuWatchdogThreadEvent::kGpuWatchdogStart has been recorded. 307 bool is_watchdog_start_histogram_recorded = false; 308 309 // Read/Write by the watchdog thread only after initialized in the 310 // constructor. 311 bool in_gpu_initialization_ = false; 312 313 // The number of logical processors/cores on the current machine. 314 int num_of_processors_ = 0; 315 316 // Don't kill the GPU process immediately after a gpu hang is detected. Wait 317 // for extra cycles of timeout. Kill it, if the GPU still doesn't respond 318 // after wait. 319 const int max_extra_cycles_before_kill_; 320 // how many cycles of timeout since we detect a hang. 321 int count_of_extra_cycles_ = 0; 322 323 // For the experiment and the debugging purpose 324 size_t num_of_timeout_after_power_resume_ = 0; 325 size_t num_of_timeout_after_foregrounded_ = 0; 326 bool foregrounded_event_ = false; 327 bool power_resumed_event_ = false; 328 329 // For gpu testing only. 330 const bool is_test_mode_; 331 // Set by the watchdog thread and Read by the test thread. 332 base::AtomicFlag test_result_timeout_and_gpu_hang_; 333 334 scoped_refptr<base::SingleThreadTaskRunner> watched_gpu_task_runner_; 335 scoped_refptr<base::SingleThreadTaskRunner> watchdog_thread_task_runner_; 336 337 base::WeakPtr<GpuWatchdogThread> weak_ptr_; 338 base::WeakPtrFactory<GpuWatchdogThread> weak_factory_{this}; 339 340 DISALLOW_COPY_AND_ASSIGN(GpuWatchdogThread); 341 }; 342 343 } // namespace gpu 344 #endif // GPU_IPC_SERVICE_GPU_WATCHDOG_THREAD_H_ 345