1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "gpu/ipc/service/gpu_watchdog_thread.h"
6 
7 #include "base/atomicops.h"
8 #include "base/bind.h"
9 #include "base/bit_cast.h"
10 #include "base/callback_helpers.h"
11 #include "base/debug/alias.h"
12 #include "base/debug/dump_without_crashing.h"
13 #include "base/files/file_path.h"
14 #include "base/files/file_util.h"
15 #include "base/memory/ptr_util.h"
16 #include "base/metrics/field_trial_params.h"
17 #include "base/metrics/histogram_functions.h"
18 #include "base/native_library.h"
19 #include "base/numerics/safe_conversions.h"
20 #include "base/power_monitor/power_monitor.h"
21 #include "base/process/process.h"
22 #include "base/strings/string_number_conversions.h"
23 #include "base/system/sys_info.h"
24 #include "base/task/current_thread.h"
25 #include "base/threading/platform_thread.h"
26 #include "base/threading/thread_task_runner_handle.h"
27 #include "base/time/time.h"
28 #include "build/build_config.h"
29 #include "gpu/config/gpu_crash_keys.h"
30 #include "gpu/config/gpu_finch_features.h"
31 #include "gpu/ipc/common/result_codes.h"
32 
33 #if defined(OS_WIN)
34 #include "base/win/windows_version.h"
35 #endif
36 
37 namespace gpu {
38 #if defined(OS_WIN)
GetGpuWatchdogTimeoutBasedOnCpuCores()39 base::TimeDelta GetGpuWatchdogTimeoutBasedOnCpuCores() {
40   if (base::win::GetVersion() >= base::win::Version::WIN10) {
41     int num_of_processors = base::SysInfo::NumberOfProcessors();
42 
43     if (num_of_processors > 8)
44       return (kGpuWatchdogTimeout - base::TimeDelta::FromSeconds(10));
45     else if (num_of_processors <= 4)
46       return kGpuWatchdogTimeout + base::TimeDelta::FromSeconds(5);
47   }
48 
49   return kGpuWatchdogTimeout;
50 }
51 #endif
52 
GpuWatchdogThread(base::TimeDelta timeout,int init_factor,int restart_factor,int max_extra_cycles_before_kill,bool is_test_mode)53 GpuWatchdogThread::GpuWatchdogThread(base::TimeDelta timeout,
54                                      int init_factor,
55                                      int restart_factor,
56                                      int max_extra_cycles_before_kill,
57                                      bool is_test_mode)
58     : base::Thread("GpuWatchdog"),
59       watchdog_timeout_(timeout),
60       watchdog_init_factor_(init_factor),
61       watchdog_restart_factor_(restart_factor),
62       in_gpu_initialization_(true),
63       max_extra_cycles_before_kill_(max_extra_cycles_before_kill),
64       is_test_mode_(is_test_mode),
65       watched_gpu_task_runner_(base::ThreadTaskRunnerHandle::Get()) {
66   base::CurrentThread::Get()->AddTaskObserver(this);
67   num_of_processors_ = base::SysInfo::NumberOfProcessors();
68 
69 #if defined(OS_WIN)
70   // GetCurrentThread returns a pseudo-handle that cannot be used by one thread
71   // to identify another. DuplicateHandle creates a "real" handle that can be
72   // used for this purpose.
73   if (!DuplicateHandle(GetCurrentProcess(), GetCurrentThread(),
74                        GetCurrentProcess(), &watched_thread_handle_,
75                        THREAD_QUERY_INFORMATION, FALSE, 0)) {
76     watched_thread_handle_ = nullptr;
77   }
78 #endif
79 
80 #if defined(USE_X11) && !defined(OS_BSD)
81   tty_file_ = base::OpenFile(
82       base::FilePath(FILE_PATH_LITERAL("/sys/class/tty/tty0/active")), "r");
83   UpdateActiveTTY();
84   host_tty_ = active_tty_;
85 #endif
86 
87   Arm();
88 }
89 
~GpuWatchdogThread()90 GpuWatchdogThread::~GpuWatchdogThread() {
91   DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
92   // Stop() might take too long and the watchdog timeout is triggered.
93   // Disarm first before calling Stop() to avoid a crash.
94   if (IsArmed())
95     Disarm();
96   PauseWatchdog();
97 
98   Stop();  // stop the watchdog thread
99 
100   base::CurrentThread::Get()->RemoveTaskObserver(this);
101   base::PowerMonitor::RemoveObserver(this);
102   GpuWatchdogHistogram(GpuWatchdogThreadEvent::kGpuWatchdogEnd);
103 #if defined(OS_WIN)
104   if (watched_thread_handle_)
105     CloseHandle(watched_thread_handle_);
106 #endif
107 
108 #if defined(USE_X11) && !defined(OS_BSD)
109   if (tty_file_)
110     fclose(tty_file_);
111 #endif
112 }
113 
114 // static
Create(bool start_backgrounded,base::TimeDelta timeout,int init_factor,int restart_factor,int max_extra_cycles_before_kill,bool is_test_mode)115 std::unique_ptr<GpuWatchdogThread> GpuWatchdogThread::Create(
116     bool start_backgrounded,
117     base::TimeDelta timeout,
118     int init_factor,
119     int restart_factor,
120     int max_extra_cycles_before_kill,
121     bool is_test_mode) {
122   auto watchdog_thread = base::WrapUnique(
123       new GpuWatchdogThread(timeout, init_factor, restart_factor,
124                             max_extra_cycles_before_kill, is_test_mode));
125   base::Thread::Options options;
126   options.timer_slack = base::TIMER_SLACK_MAXIMUM;
127   watchdog_thread->StartWithOptions(options);
128   if (start_backgrounded)
129     watchdog_thread->OnBackgrounded();
130   return watchdog_thread;
131 }
132 
133 // static
Create(bool start_backgrounded)134 std::unique_ptr<GpuWatchdogThread> GpuWatchdogThread::Create(
135     bool start_backgrounded) {
136   base::TimeDelta gpu_watchdog_timeout = kGpuWatchdogTimeout;
137   int init_factor = kInitFactor;
138   int restart_factor = kRestartFactor;
139   int max_extra_cycles_before_kill = kMaxExtraCyclesBeforeKill;
140 
141 #if defined(OS_WIN)
142   gpu_watchdog_timeout = GetGpuWatchdogTimeoutBasedOnCpuCores();
143 #endif
144 
145   if (base::FeatureList::IsEnabled(features::kGpuWatchdogV2NewTimeout)) {
146     const char kNewTimeOutParam[] = "new_time_out";
147     const char kMaxExtraCyclesBeforeKillParam[] =
148         "max_extra_cycles_before_kill";
149 
150 #if defined(OS_WIN)
151     constexpr int kFinchMaxExtraCyclesBeforeKill = 0;
152 #elif defined(OS_ANDROID)
153     constexpr int kFinchMaxExtraCyclesBeforeKill = 0;
154     init_factor = kInitFactorFinch;
155     restart_factor = kRestartFactorFinch;
156 #elif defined(OS_MAC)
157     constexpr int kFinchMaxExtraCyclesBeforeKill = 1;
158 #else
159     constexpr int kFinchMaxExtraCyclesBeforeKill = 2;
160 #endif
161 
162     int timeout = base::GetFieldTrialParamByFeatureAsInt(
163         features::kGpuWatchdogV2NewTimeout, kNewTimeOutParam,
164         gpu_watchdog_timeout.InSeconds());
165     gpu_watchdog_timeout = base::TimeDelta::FromSeconds(timeout);
166 
167     max_extra_cycles_before_kill = base::GetFieldTrialParamByFeatureAsInt(
168         features::kGpuWatchdogV2NewTimeout, kMaxExtraCyclesBeforeKillParam,
169         kFinchMaxExtraCyclesBeforeKill);
170   }
171 
172   return Create(start_backgrounded, gpu_watchdog_timeout, init_factor,
173                 restart_factor, max_extra_cycles_before_kill, false);
174 }
175 
176 // Do not add power observer during watchdog init, PowerMonitor might not be up
177 // running yet.
AddPowerObserver()178 void GpuWatchdogThread::AddPowerObserver() {
179   DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
180 
181   // Forward it to the watchdog thread. Call PowerMonitor::AddObserver on the
182   // watchdog thread so that OnSuspend and OnResume will be called on watchdog
183   // thread.
184   is_add_power_observer_called_ = true;
185   task_runner()->PostTask(FROM_HERE,
186                           base::BindOnce(&GpuWatchdogThread::OnAddPowerObserver,
187                                          base::Unretained(this)));
188 }
189 
190 // Android Chrome goes to the background. Called from the gpu thread.
OnBackgrounded()191 void GpuWatchdogThread::OnBackgrounded() {
192   task_runner()->PostTask(
193       FROM_HERE,
194       base::BindOnce(&GpuWatchdogThread::StopWatchdogTimeoutTask,
195                      base::Unretained(this), kAndroidBackgroundForeground));
196 }
197 
198 // Android Chrome goes to the foreground. Called from the gpu thread.
OnForegrounded()199 void GpuWatchdogThread::OnForegrounded() {
200   task_runner()->PostTask(
201       FROM_HERE,
202       base::BindOnce(&GpuWatchdogThread::RestartWatchdogTimeoutTask,
203                      base::Unretained(this), kAndroidBackgroundForeground));
204 }
205 
206 // Called from the gpu thread when gpu init has completed.
OnInitComplete()207 void GpuWatchdogThread::OnInitComplete() {
208   DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
209 
210   task_runner()->PostTask(
211       FROM_HERE, base::BindOnce(&GpuWatchdogThread::UpdateInitializationFlag,
212                                 base::Unretained(this)));
213   Disarm();
214 }
215 
216 // Called from the gpu thread in viz::GpuServiceImpl::~GpuServiceImpl().
217 // After this, no Disarm() will be called before the watchdog thread is
218 // destroyed. If this destruction takes too long, the watchdog timeout
219 // will be triggered.
OnGpuProcessTearDown()220 void GpuWatchdogThread::OnGpuProcessTearDown() {
221   DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
222 
223   in_gpu_process_teardown_ = true;
224   if (!IsArmed())
225     Arm();
226 }
227 
228 // Called from the gpu main thread.
PauseWatchdog()229 void GpuWatchdogThread::PauseWatchdog() {
230   DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
231 
232   task_runner()->PostTask(
233       FROM_HERE, base::BindOnce(&GpuWatchdogThread::StopWatchdogTimeoutTask,
234                                 base::Unretained(this), kGeneralGpuFlow));
235 }
236 
237 // Called from the gpu main thread.
ResumeWatchdog()238 void GpuWatchdogThread::ResumeWatchdog() {
239   DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
240 
241   task_runner()->PostTask(
242       FROM_HERE, base::BindOnce(&GpuWatchdogThread::RestartWatchdogTimeoutTask,
243                                 base::Unretained(this), kGeneralGpuFlow));
244 }
245 
246 // Running on the watchdog thread.
247 // On Linux, Init() will be called twice for Sandbox Initialization. The
248 // watchdog is stopped and then restarted in StartSandboxLinux(). Everything
249 // should be the same and continue after the second init().
Init()250 void GpuWatchdogThread::Init() {
251   watchdog_thread_task_runner_ = base::ThreadTaskRunnerHandle::Get();
252 
253   // Get and Invalidate weak_ptr should be done on the watchdog thread only.
254   weak_ptr_ = weak_factory_.GetWeakPtr();
255   base::TimeDelta timeout = watchdog_timeout_ * kInitFactor;
256   task_runner()->PostDelayedTask(
257       FROM_HERE,
258       base::BindOnce(&GpuWatchdogThread::OnWatchdogTimeout, weak_ptr_),
259       timeout);
260 
261   last_arm_disarm_counter_ = ReadArmDisarmCounter();
262   watchdog_start_timeticks_ = base::TimeTicks::Now();
263   last_on_watchdog_timeout_timeticks_ = watchdog_start_timeticks_;
264   next_on_watchdog_timeout_time_ = base::Time::Now() + timeout;
265 
266 #if defined(OS_WIN)
267   if (watched_thread_handle_) {
268     if (base::ThreadTicks::IsSupported())
269       base::ThreadTicks::WaitUntilInitialized();
270     last_on_watchdog_timeout_thread_ticks_ = GetWatchedThreadTime();
271     remaining_watched_thread_ticks_ = timeout;
272   }
273 #endif
274 }
275 
276 // Running on the watchdog thread.
CleanUp()277 void GpuWatchdogThread::CleanUp() {
278   DCHECK(watchdog_thread_task_runner_->BelongsToCurrentThread());
279   weak_factory_.InvalidateWeakPtrs();
280 }
281 
ReportProgress()282 void GpuWatchdogThread::ReportProgress() {
283   DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
284   InProgress();
285 }
286 
WillProcessTask(const base::PendingTask & pending_task,bool was_blocked_or_low_priority)287 void GpuWatchdogThread::WillProcessTask(const base::PendingTask& pending_task,
288                                         bool was_blocked_or_low_priority) {
289   DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
290 
291   // The watchdog is armed at the beginning of the gpu process teardown.
292   // Do not call Arm() during teardown.
293   if (in_gpu_process_teardown_)
294     DCHECK(IsArmed());
295   else
296     Arm();
297 }
298 
DidProcessTask(const base::PendingTask & pending_task)299 void GpuWatchdogThread::DidProcessTask(const base::PendingTask& pending_task) {
300   DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
301 
302   // Keep the watchdog armed during tear down.
303   if (in_gpu_process_teardown_)
304     InProgress();
305   else
306     Disarm();
307 }
308 
309 // Power Suspends. Running on the watchdog thread.
OnSuspend()310 void GpuWatchdogThread::OnSuspend() {
311   StopWatchdogTimeoutTask(kPowerSuspendResume);
312 }
313 
314 // Power Resumes. Running on the watchdog thread.
OnResume()315 void GpuWatchdogThread::OnResume() {
316   RestartWatchdogTimeoutTask(kPowerSuspendResume);
317 }
318 
319 // Running on the watchdog thread.
OnAddPowerObserver()320 void GpuWatchdogThread::OnAddPowerObserver() {
321   DCHECK(watchdog_thread_task_runner_->BelongsToCurrentThread());
322   DCHECK(base::PowerMonitor::IsInitialized());
323 
324   base::PowerMonitor::AddObserver(this);
325   is_power_observer_added_ = true;
326 }
327 
328 // Running on the watchdog thread.
RestartWatchdogTimeoutTask(PauseResumeSource source_of_request)329 void GpuWatchdogThread::RestartWatchdogTimeoutTask(
330     PauseResumeSource source_of_request) {
331   DCHECK(watchdog_thread_task_runner_->BelongsToCurrentThread());
332   base::TimeDelta timeout;
333 
334   switch (source_of_request) {
335     case kAndroidBackgroundForeground:
336       if (!is_backgrounded_)
337         return;
338       is_backgrounded_ = false;
339       timeout = watchdog_timeout_ * watchdog_restart_factor_;
340       foregrounded_timeticks_ = base::TimeTicks::Now();
341       foregrounded_event_ = true;
342       num_of_timeout_after_foregrounded_ = 0;
343       break;
344     case kPowerSuspendResume:
345       if (!in_power_suspension_)
346         return;
347       in_power_suspension_ = false;
348       timeout = watchdog_timeout_ * watchdog_restart_factor_;
349       power_resume_timeticks_ = base::TimeTicks::Now();
350       power_resumed_event_ = true;
351       num_of_timeout_after_power_resume_ = 0;
352       break;
353     case kGeneralGpuFlow:
354       if (!is_paused_)
355         return;
356       is_paused_ = false;
357       timeout = watchdog_timeout_ * watchdog_init_factor_;
358       watchdog_resume_timeticks_ = base::TimeTicks::Now();
359       break;
360   }
361 
362   if (!is_backgrounded_ && !in_power_suspension_ && !is_paused_) {
363     weak_ptr_ = weak_factory_.GetWeakPtr();
364     task_runner()->PostDelayedTask(
365         FROM_HERE,
366         base::BindOnce(&GpuWatchdogThread::OnWatchdogTimeout, weak_ptr_),
367         timeout);
368     last_on_watchdog_timeout_timeticks_ = base::TimeTicks::Now();
369     next_on_watchdog_timeout_time_ = base::Time::Now() + timeout;
370     last_arm_disarm_counter_ = ReadArmDisarmCounter();
371 #if defined(OS_WIN)
372     if (watched_thread_handle_) {
373       last_on_watchdog_timeout_thread_ticks_ = GetWatchedThreadTime();
374       remaining_watched_thread_ticks_ = timeout;
375     }
376 #endif
377   }
378 }
379 
StopWatchdogTimeoutTask(PauseResumeSource source_of_request)380 void GpuWatchdogThread::StopWatchdogTimeoutTask(
381     PauseResumeSource source_of_request) {
382   DCHECK(watchdog_thread_task_runner_->BelongsToCurrentThread());
383 
384   switch (source_of_request) {
385     case kAndroidBackgroundForeground:
386       if (is_backgrounded_)
387         return;
388       is_backgrounded_ = true;
389       backgrounded_timeticks_ = base::TimeTicks::Now();
390       foregrounded_event_ = false;
391       break;
392     case kPowerSuspendResume:
393       if (in_power_suspension_)
394         return;
395       in_power_suspension_ = true;
396       power_suspend_timeticks_ = base::TimeTicks::Now();
397       power_resumed_event_ = false;
398       break;
399     case kGeneralGpuFlow:
400       if (is_paused_)
401         return;
402       is_paused_ = true;
403       watchdog_pause_timeticks_ = base::TimeTicks::Now();
404       break;
405   }
406 
407   // Revoke any pending watchdog timeout task
408   weak_factory_.InvalidateWeakPtrs();
409 }
410 
UpdateInitializationFlag()411 void GpuWatchdogThread::UpdateInitializationFlag() {
412   in_gpu_initialization_ = false;
413 }
414 
415 // Called from the gpu main thread.
416 // The watchdog is armed only in these three functions -
417 // GpuWatchdogThread(), WillProcessTask(), and OnGpuProcessTearDown()
Arm()418 void GpuWatchdogThread::Arm() {
419   DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
420 
421   base::subtle::NoBarrier_AtomicIncrement(&arm_disarm_counter_, 1);
422 
423   // Arm/Disarm are always called in sequence. Now it's an odd number.
424   DCHECK(IsArmed());
425 }
426 
Disarm()427 void GpuWatchdogThread::Disarm() {
428   DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
429 
430   base::subtle::NoBarrier_AtomicIncrement(&arm_disarm_counter_, 1);
431 
432   // Arm/Disarm are always called in sequence. Now it's an even number.
433   DCHECK(!IsArmed());
434 }
435 
InProgress()436 void GpuWatchdogThread::InProgress() {
437   DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
438 
439   // Increment by 2. This is equivalent to Disarm() + Arm().
440   base::subtle::NoBarrier_AtomicIncrement(&arm_disarm_counter_, 2);
441 
442   // Now it's an odd number.
443   DCHECK(IsArmed());
444 }
445 
IsArmed()446 bool GpuWatchdogThread::IsArmed() {
447   // It's an odd number.
448   return base::subtle::NoBarrier_Load(&arm_disarm_counter_) & 1;
449 }
450 
ReadArmDisarmCounter()451 base::subtle::Atomic32 GpuWatchdogThread::ReadArmDisarmCounter() {
452   return base::subtle::NoBarrier_Load(&arm_disarm_counter_);
453 }
454 
455 // Running on the watchdog thread.
OnWatchdogTimeout()456 void GpuWatchdogThread::OnWatchdogTimeout() {
457   DCHECK(watchdog_thread_task_runner_->BelongsToCurrentThread());
458   DCHECK(!is_backgrounded_);
459   DCHECK(!in_power_suspension_);
460   DCHECK(!is_paused_);
461 
462   // If this metric is added too early (eg. watchdog creation time), it cannot
463   // be persistent. The histogram data will be lost after crash or browser exit.
464   // Delay the recording of kGpuWatchdogStart until the firs
465   // OnWatchdogTimeout() to ensure this metric is created in the persistent
466   // memory.
467   if (!is_watchdog_start_histogram_recorded) {
468     is_watchdog_start_histogram_recorded = true;
469     GpuWatchdogHistogram(GpuWatchdogThreadEvent::kGpuWatchdogStart);
470   }
471 
472   auto arm_disarm_counter = ReadArmDisarmCounter();
473   GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kTimeout);
474   if (power_resumed_event_)
475     num_of_timeout_after_power_resume_++;
476   if (foregrounded_event_)
477     num_of_timeout_after_foregrounded_++;
478 
479 #if defined(USE_X11) && !defined(OS_BSD)
480   UpdateActiveTTY();
481 #endif
482 
483   // Collect all needed info for gpu hang detection.
484   bool disarmed = arm_disarm_counter % 2 == 0;  // even number
485   bool gpu_makes_progress = arm_disarm_counter != last_arm_disarm_counter_;
486   bool no_gpu_hang = disarmed || gpu_makes_progress || SlowWatchdogThread();
487 
488   bool watched_thread_needs_more_time =
489       WatchedThreadNeedsMoreThreadTime(no_gpu_hang);
490   no_gpu_hang = no_gpu_hang || watched_thread_needs_more_time ||
491                 ContinueOnNonHostX11ServerTty();
492 
493   bool allows_extra_timeout = WatchedThreadGetsExtraTimeout(no_gpu_hang);
494   no_gpu_hang = no_gpu_hang || allows_extra_timeout;
495 
496   // No gpu hang. Continue with another OnWatchdogTimeout task.
497   if (no_gpu_hang) {
498     last_on_watchdog_timeout_timeticks_ = base::TimeTicks::Now();
499     next_on_watchdog_timeout_time_ = base::Time::Now() + watchdog_timeout_;
500     last_arm_disarm_counter_ = ReadArmDisarmCounter();
501 
502     task_runner()->PostDelayedTask(
503         FROM_HERE,
504         base::BindOnce(&GpuWatchdogThread::OnWatchdogTimeout, weak_ptr_),
505         watchdog_timeout_);
506     return;
507   }
508 
509   // Still armed without any progress. GPU possibly hangs.
510   GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kKill);
511 #if defined(OS_WIN)
512   if (less_than_full_thread_time_after_capped_)
513     GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kKillOnLessThreadTime);
514 #endif
515 
516   DeliberatelyTerminateToRecoverFromHang();
517 }
518 
SlowWatchdogThread()519 bool GpuWatchdogThread::SlowWatchdogThread() {
520   // If it takes 15 more seconds than the expected time between two
521   // OnWatchdogTimeout() calls, the system is considered slow and it's not a GPU
522   // hang.
523   bool slow_watchdog_thread =
524       (base::Time::Now() - next_on_watchdog_timeout_time_) >=
525       base::TimeDelta::FromSeconds(15);
526 
527   // Record this case only when a GPU hang is detected and the thread is slow.
528   if (slow_watchdog_thread)
529     GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kSlowWatchdogThread);
530 
531   return slow_watchdog_thread;
532 }
533 
WatchedThreadNeedsMoreThreadTime(bool no_gpu_hang_detected)534 bool GpuWatchdogThread::WatchedThreadNeedsMoreThreadTime(
535     bool no_gpu_hang_detected) {
536 #if defined(OS_WIN)
537   if (!watched_thread_handle_)
538     return false;
539 
540   // We allow extra thread time. When that runs out, we extend extra timeout
541   // cycles. Now, we are extending extra timeout cycles. Don't add extra thread
542   // time.
543   if (count_of_extra_cycles_ > 0)
544     return false;
545 
546   WatchedThreadNeedsMoreThreadTimeHistogram(
547       no_gpu_hang_detected,
548       /*start_of_more_thread_time*/ false);
549 
550   if (!no_gpu_hang_detected && count_of_more_gpu_thread_time_allowed_ >=
551                                    kMaxCountOfMoreGpuThreadTimeAllowed) {
552     less_than_full_thread_time_after_capped_ = true;
553   } else {
554     less_than_full_thread_time_after_capped_ = false;
555   }
556 
557   // Calculate how many thread ticks the watched thread spent doing the work.
558   base::ThreadTicks now = GetWatchedThreadTime();
559   base::TimeDelta thread_time_elapsed =
560       now - last_on_watchdog_timeout_thread_ticks_;
561   last_on_watchdog_timeout_thread_ticks_ = now;
562   remaining_watched_thread_ticks_ -= thread_time_elapsed;
563 
564   if (no_gpu_hang_detected ||
565       count_of_more_gpu_thread_time_allowed_ >=
566           kMaxCountOfMoreGpuThreadTimeAllowed ||
567       thread_time_elapsed < base::TimeDelta() /* bogus data */ ||
568       remaining_watched_thread_ticks_ <= base::TimeDelta()) {
569     // Reset the remaining thread ticks.
570     remaining_watched_thread_ticks_ = watchdog_timeout_;
571     count_of_more_gpu_thread_time_allowed_ = 0;
572 
573     return false;
574   } else {
575     // This is the start of allowing more thread time.
576     if (count_of_more_gpu_thread_time_allowed_ == 0) {
577       WatchedThreadNeedsMoreThreadTimeHistogram(
578           no_gpu_hang_detected, /*start_of_more_thread_time*/ true);
579     }
580     count_of_more_gpu_thread_time_allowed_++;
581 
582     return true;
583   }
584 #else
585   return false;
586 #endif
587 }
588 
589 #if defined(OS_WIN)
GetWatchedThreadTime()590 base::ThreadTicks GpuWatchdogThread::GetWatchedThreadTime() {
591   DCHECK(watched_thread_handle_);
592 
593   if (base::ThreadTicks::IsSupported()) {
594     // Note: GetForThread() might return bogus results if running on different
595     // CPUs between two calls.
596     return base::ThreadTicks::GetForThread(
597         base::PlatformThreadHandle(watched_thread_handle_));
598   } else {
599     FILETIME creation_time;
600     FILETIME exit_time;
601     FILETIME kernel_time;
602     FILETIME user_time;
603     BOOL result = GetThreadTimes(watched_thread_handle_, &creation_time,
604                                  &exit_time, &kernel_time, &user_time);
605     if (!result)
606       return base::ThreadTicks();
607 
608     // Need to bit_cast to fix alignment, then divide by 10 to convert
609     // 100-nanoseconds to microseconds.
610     int64_t user_time_us = bit_cast<int64_t, FILETIME>(user_time) / 10;
611     int64_t kernel_time_us = bit_cast<int64_t, FILETIME>(kernel_time) / 10;
612 
613     return base::ThreadTicks() +
614            base::TimeDelta::FromMicroseconds(user_time_us + kernel_time_us);
615   }
616 }
617 #endif
618 
WatchedThreadGetsExtraTimeout(bool no_gpu_hang)619 bool GpuWatchdogThread::WatchedThreadGetsExtraTimeout(bool no_gpu_hang) {
620   if (max_extra_cycles_before_kill_ == 0)
621     return false;
622 
623   // We want to record histograms even if there is no gpu hang.
624   bool allows_more_timeouts = false;
625   WatchedThreadGetsExtraTimeoutHistogram(no_gpu_hang);
626 
627   if (no_gpu_hang) {
628     if (count_of_extra_cycles_ > 0) {
629       count_of_extra_cycles_ = 0;
630     }
631   } else if (count_of_extra_cycles_ < max_extra_cycles_before_kill_) {
632     count_of_extra_cycles_++;
633     allows_more_timeouts = true;
634   }
635 
636   return allows_more_timeouts;
637 }
638 
DeliberatelyTerminateToRecoverFromHang()639 void GpuWatchdogThread::DeliberatelyTerminateToRecoverFromHang() {
640   DCHECK(watchdog_thread_task_runner_->BelongsToCurrentThread());
641   // If this is for gpu testing, do not terminate the gpu process.
642   if (is_test_mode_) {
643     test_result_timeout_and_gpu_hang_.Set();
644     return;
645   }
646 
647 #if defined(OS_WIN)
648   if (IsDebuggerPresent())
649     return;
650 #endif
651 
652   // Store variables so they're available in crash dumps to help determine the
653   // cause of any hang.
654   base::TimeTicks function_begin_timeticks = base::TimeTicks::Now();
655   base::debug::Alias(&in_gpu_initialization_);
656   base::debug::Alias(&num_of_timeout_after_power_resume_);
657   base::debug::Alias(&num_of_timeout_after_foregrounded_);
658   base::debug::Alias(&function_begin_timeticks);
659   base::debug::Alias(&watchdog_start_timeticks_);
660   base::debug::Alias(&power_suspend_timeticks_);
661   base::debug::Alias(&power_resume_timeticks_);
662   base::debug::Alias(&backgrounded_timeticks_);
663   base::debug::Alias(&foregrounded_timeticks_);
664   base::debug::Alias(&watchdog_pause_timeticks_);
665   base::debug::Alias(&watchdog_resume_timeticks_);
666   base::debug::Alias(&in_power_suspension_);
667   base::debug::Alias(&in_gpu_process_teardown_);
668   base::debug::Alias(&is_backgrounded_);
669   base::debug::Alias(&is_add_power_observer_called_);
670   base::debug::Alias(&is_power_observer_added_);
671   base::debug::Alias(&last_on_watchdog_timeout_timeticks_);
672   base::TimeDelta timeticks_elapses =
673       function_begin_timeticks - last_on_watchdog_timeout_timeticks_;
674   base::debug::Alias(&timeticks_elapses);
675   base::debug::Alias(&max_extra_cycles_before_kill_);
676 #if defined(OS_WIN)
677   base::debug::Alias(&remaining_watched_thread_ticks_);
678   base::debug::Alias(&less_than_full_thread_time_after_capped_);
679 #endif
680 
681   GpuWatchdogHistogram(GpuWatchdogThreadEvent::kGpuWatchdogKill);
682 
683   crash_keys::gpu_watchdog_crashed_in_gpu_init.Set(
684       in_gpu_initialization_ ? "1" : "0");
685 
686   crash_keys::gpu_watchdog_kill_after_power_resume.Set(
687       WithinOneMinFromPowerResumed() ? "1" : "0");
688 
689   crash_keys::num_of_processors.Set(base::NumberToString(num_of_processors_));
690 
691   // Check the arm_disarm_counter value one more time.
692   auto last_arm_disarm_counter = ReadArmDisarmCounter();
693   base::debug::Alias(&last_arm_disarm_counter);
694 
695   // Use RESULT_CODE_HUNG so this crash is separated from other
696   // EXCEPTION_ACCESS_VIOLATION buckets for UMA analysis.
697   // Create a crash dump first. TerminateCurrentProcessImmediately will not
698   // create a dump.
699   base::debug::DumpWithoutCrashing();
700   base::Process::TerminateCurrentProcessImmediately(RESULT_CODE_HUNG);
701 }
702 
GpuWatchdogHistogram(GpuWatchdogThreadEvent thread_event)703 void GpuWatchdogThread::GpuWatchdogHistogram(
704     GpuWatchdogThreadEvent thread_event) {
705   base::UmaHistogramEnumeration("GPU.WatchdogThread.Event", thread_event);
706 }
707 
GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent timeout_event)708 void GpuWatchdogThread::GpuWatchdogTimeoutHistogram(
709     GpuWatchdogTimeoutEvent timeout_event) {
710   base::UmaHistogramEnumeration("GPU.WatchdogThread.Timeout", timeout_event);
711 
712   bool recorded = false;
713   if (in_gpu_initialization_) {
714     base::UmaHistogramEnumeration("GPU.WatchdogThread.Timeout.Init",
715                                   timeout_event);
716     recorded = true;
717   }
718 
719   if (WithinOneMinFromPowerResumed()) {
720     base::UmaHistogramEnumeration("GPU.WatchdogThread.Timeout.PowerResume",
721                                   timeout_event);
722     recorded = true;
723   }
724 
725   if (WithinOneMinFromForegrounded()) {
726     base::UmaHistogramEnumeration("GPU.WatchdogThread.Timeout.Foregrounded",
727                                   timeout_event);
728     recorded = true;
729   }
730 
731   if (!recorded) {
732     base::UmaHistogramEnumeration("GPU.WatchdogThread.Timeout.Normal",
733                                   timeout_event);
734   }
735 }
736 
737 #if defined(OS_WIN)
RecordExtraThreadTimeHistogram()738 void GpuWatchdogThread::RecordExtraThreadTimeHistogram() {
739   // Record the number of timeouts the GPU main thread needs to make a progress
740   // after GPU OnWatchdogTimeout() is triggered. The maximum count is 6 which
741   // is more  than kMaxCountOfMoreGpuThreadTimeAllowed(4);
742   constexpr int kMin = 1;
743   constexpr int kMax = 6;
744   constexpr int kBuckets = 6;
745   int count = count_of_more_gpu_thread_time_allowed_;
746   bool recorded = false;
747 
748   base::UmaHistogramCustomCounts("GPU.WatchdogThread.ExtraThreadTime", count,
749                                  kMin, kMax, kBuckets);
750 
751   if (in_gpu_initialization_) {
752     base::UmaHistogramCustomCounts("GPU.WatchdogThread.ExtraThreadTime.Init",
753                                    count, kMin, kMax, kBuckets);
754     recorded = true;
755   }
756 
757   if (WithinOneMinFromPowerResumed()) {
758     base::UmaHistogramCustomCounts(
759         "GPU.WatchdogThread.ExtraThreadTime.PowerResume", count, kMin, kMax,
760         kBuckets);
761     recorded = true;
762   }
763 
764   if (WithinOneMinFromForegrounded()) {
765     base::UmaHistogramCustomCounts(
766         "GPU.WatchdogThread.ExtraThreadTime.Foregrounded", count, kMin, kMax,
767         kBuckets);
768     recorded = true;
769   }
770 
771   if (!recorded) {
772     base::UmaHistogramCustomCounts("GPU.WatchdogThread.ExtraThreadTime.Normal",
773                                    count, kMin, kMax, kBuckets);
774   }
775 }
776 
RecordNumOfUsersWaitingWithExtraThreadTimeHistogram(int count)777 void GpuWatchdogThread::RecordNumOfUsersWaitingWithExtraThreadTimeHistogram(
778     int count) {
779   constexpr int kMax = 4;
780 
781   base::UmaHistogramExactLinear("GPU.WatchdogThread.ExtraThreadTime.NumOfUsers",
782                                 count, kMax);
783 }
784 
WatchedThreadNeedsMoreThreadTimeHistogram(bool no_gpu_hang_detected,bool start_of_more_thread_time)785 void GpuWatchdogThread::WatchedThreadNeedsMoreThreadTimeHistogram(
786     bool no_gpu_hang_detected,
787     bool start_of_more_thread_time) {
788   if (start_of_more_thread_time) {
789     // This is the start of allowing more thread time. Only record it once for
790     // all following timeouts on the same detected gpu hang, so we know this
791     // is equivlent one crash in our crash reports.
792     GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kMoreThreadTime);
793     RecordNumOfUsersWaitingWithExtraThreadTimeHistogram(0);
794   } else {
795     if (count_of_more_gpu_thread_time_allowed_ > 0) {
796       if (no_gpu_hang_detected) {
797         // If count_of_more_gpu_thread_time_allowed_ > 0, we know extra time was
798         // extended in the previous OnWatchdogTimeout(). Now we find gpu makes
799         // progress. Record this case.
800         GpuWatchdogTimeoutHistogram(
801             GpuWatchdogTimeoutEvent::kProgressAfterMoreThreadTime);
802         RecordExtraThreadTimeHistogram();
803       } else {
804         if (count_of_more_gpu_thread_time_allowed_ >=
805             kMaxCountOfMoreGpuThreadTimeAllowed) {
806           GpuWatchdogTimeoutHistogram(
807               GpuWatchdogTimeoutEvent::kLessThanFullThreadTimeAfterCapped);
808         }
809       }
810 
811       // Records the number of users who are still waiting. We can use this
812       // number to calculate the number of users who had already quit.
813       RecordNumOfUsersWaitingWithExtraThreadTimeHistogram(
814           count_of_more_gpu_thread_time_allowed_);
815 
816       // Used by GPU.WatchdogThread.WaitTime later
817       time_in_wait_for_full_thread_time_ =
818           count_of_more_gpu_thread_time_allowed_ * watchdog_timeout_;
819     }
820   }
821 }
822 #endif
823 
WatchedThreadGetsExtraTimeoutHistogram(bool no_gpu_hang)824 void GpuWatchdogThread::WatchedThreadGetsExtraTimeoutHistogram(
825     bool no_gpu_hang) {
826   constexpr int kMax = 60;
827   if (count_of_extra_cycles_ == 0 && !no_gpu_hang) {
828     GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kTimeoutWait);
829     base::UmaHistogramExactLinear("GPU.WatchdogThread.WaitTime.NumOfUsers", 0,
830                                   kMax);
831   } else if (count_of_extra_cycles_ > 0) {
832     int count = watchdog_timeout_.InSeconds() * count_of_extra_cycles_;
833     base::UmaHistogramExactLinear("GPU.WatchdogThread.WaitTime.NumOfUsers",
834                                   count, kMax);
835     if (no_gpu_hang) {
836       GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kProgressAfterWait);
837       base::UmaHistogramExactLinear(
838           "GPU.WatchdogThread.WaitTime.ProgressAfterWait", count, kMax);
839 
840 #if defined(OS_WIN)
841       // Add the time the GPU thread was given for the full thread time up to 60
842       // seconds. GPU.WatchdogThread.WaitTime is essentially equal to
843       // GPU.WatchdogThread.WaitTime.ProgressAfterWait on non-Windows systems.
844       base::TimeDelta wait_time = base::TimeDelta::FromSeconds(count);
845       wait_time += time_in_wait_for_full_thread_time_;
846 
847       constexpr base::TimeDelta kMinTime = base::TimeDelta::FromSeconds(1);
848       constexpr base::TimeDelta kMaxTime = base::TimeDelta::FromSeconds(150);
849       constexpr int kBuckets = 50;
850 
851       // The time the GPU main thread takes to finish a task after a "hang" is
852       // dectedted.
853       base::UmaHistogramCustomTimes("GPU.WatchdogThread.WaitTime", wait_time,
854                                     kMinTime, kMaxTime, kBuckets);
855 #endif
856     }
857   }
858 }
859 
WithinOneMinFromPowerResumed()860 bool GpuWatchdogThread::WithinOneMinFromPowerResumed() {
861   size_t count = base::ClampFloor<size_t>(base::TimeDelta::FromMinutes(1) /
862                                           watchdog_timeout_);
863   return power_resumed_event_ && num_of_timeout_after_power_resume_ <= count;
864 }
865 
WithinOneMinFromForegrounded()866 bool GpuWatchdogThread::WithinOneMinFromForegrounded() {
867   size_t count = base::ClampFloor<size_t>(base::TimeDelta::FromMinutes(1) /
868                                           watchdog_timeout_);
869   return foregrounded_event_ && num_of_timeout_after_foregrounded_ <= count;
870 }
871 
872 #if defined(USE_X11) && !defined(OS_BSD)
UpdateActiveTTY()873 void GpuWatchdogThread::UpdateActiveTTY() {
874   last_active_tty_ = active_tty_;
875 
876   active_tty_ = -1;
877   char tty_string[8] = {0};
878   if (tty_file_ && !fseek(tty_file_, 0, SEEK_SET) &&
879       fread(tty_string, 1, 7, tty_file_)) {
880     int tty_number;
881     if (sscanf(tty_string, "tty%d\n", &tty_number) == 1) {
882       active_tty_ = tty_number;
883     }
884   }
885 }
886 #endif
887 
ContinueOnNonHostX11ServerTty()888 bool GpuWatchdogThread::ContinueOnNonHostX11ServerTty() {
889 #if defined(USE_X11) && !defined(OS_BSD)
890   if (host_tty_ == -1 || active_tty_ == -1)
891     return false;
892 
893   // Don't crash if we're not on the TTY of our host X11 server.
894   if (active_tty_ != host_tty_) {
895     // Only record for the time there is a change on TTY
896     if (last_active_tty_ == active_tty_) {
897       GpuWatchdogTimeoutHistogram(
898           GpuWatchdogTimeoutEvent::kContinueOnNonHostServerTty);
899     }
900     return true;
901   }
902 #endif
903   return false;
904 }
905 
906 // For gpu testing only. Return whether a GPU hang was detected or not.
IsGpuHangDetectedForTesting()907 bool GpuWatchdogThread::IsGpuHangDetectedForTesting() {
908   DCHECK(is_test_mode_);
909   return test_result_timeout_and_gpu_hang_.IsSet();
910 }
911 
912 // This should be called on the test main thread only. It will wait until the
913 // power observer is added on the watchdog thread.
WaitForPowerObserverAddedForTesting()914 void GpuWatchdogThread::WaitForPowerObserverAddedForTesting() {
915   DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
916   DCHECK(is_add_power_observer_called_);
917 
918   // Just return if it has been added.
919   if (is_power_observer_added_)
920     return;
921 
922   base::WaitableEvent event;
923   task_runner()->PostTask(
924       FROM_HERE,
925       base::BindOnce(&base::WaitableEvent::Signal, base::Unretained(&event)));
926   event.Wait();
927 }
928 
929 }  // namespace gpu
930