1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "gpu/ipc/service/gpu_watchdog_thread.h"
6
7 #include "base/atomicops.h"
8 #include "base/bind.h"
9 #include "base/bit_cast.h"
10 #include "base/callback_helpers.h"
11 #include "base/debug/alias.h"
12 #include "base/debug/dump_without_crashing.h"
13 #include "base/files/file_path.h"
14 #include "base/files/file_util.h"
15 #include "base/memory/ptr_util.h"
16 #include "base/metrics/field_trial_params.h"
17 #include "base/metrics/histogram_functions.h"
18 #include "base/native_library.h"
19 #include "base/numerics/safe_conversions.h"
20 #include "base/power_monitor/power_monitor.h"
21 #include "base/process/process.h"
22 #include "base/strings/string_number_conversions.h"
23 #include "base/system/sys_info.h"
24 #include "base/task/current_thread.h"
25 #include "base/threading/platform_thread.h"
26 #include "base/threading/thread_task_runner_handle.h"
27 #include "base/time/time.h"
28 #include "build/build_config.h"
29 #include "gpu/config/gpu_crash_keys.h"
30 #include "gpu/config/gpu_finch_features.h"
31 #include "gpu/ipc/common/result_codes.h"
32
33 #if defined(OS_WIN)
34 #include "base/win/windows_version.h"
35 #endif
36
37 namespace gpu {
38 #if defined(OS_WIN)
GetGpuWatchdogTimeoutBasedOnCpuCores()39 base::TimeDelta GetGpuWatchdogTimeoutBasedOnCpuCores() {
40 if (base::win::GetVersion() >= base::win::Version::WIN10) {
41 int num_of_processors = base::SysInfo::NumberOfProcessors();
42
43 if (num_of_processors > 8)
44 return (kGpuWatchdogTimeout - base::TimeDelta::FromSeconds(10));
45 else if (num_of_processors <= 4)
46 return kGpuWatchdogTimeout + base::TimeDelta::FromSeconds(5);
47 }
48
49 return kGpuWatchdogTimeout;
50 }
51 #endif
52
GpuWatchdogThread(base::TimeDelta timeout,int init_factor,int restart_factor,int max_extra_cycles_before_kill,bool is_test_mode)53 GpuWatchdogThread::GpuWatchdogThread(base::TimeDelta timeout,
54 int init_factor,
55 int restart_factor,
56 int max_extra_cycles_before_kill,
57 bool is_test_mode)
58 : base::Thread("GpuWatchdog"),
59 watchdog_timeout_(timeout),
60 watchdog_init_factor_(init_factor),
61 watchdog_restart_factor_(restart_factor),
62 in_gpu_initialization_(true),
63 max_extra_cycles_before_kill_(max_extra_cycles_before_kill),
64 is_test_mode_(is_test_mode),
65 watched_gpu_task_runner_(base::ThreadTaskRunnerHandle::Get()) {
66 base::CurrentThread::Get()->AddTaskObserver(this);
67 num_of_processors_ = base::SysInfo::NumberOfProcessors();
68
69 #if defined(OS_WIN)
70 // GetCurrentThread returns a pseudo-handle that cannot be used by one thread
71 // to identify another. DuplicateHandle creates a "real" handle that can be
72 // used for this purpose.
73 if (!DuplicateHandle(GetCurrentProcess(), GetCurrentThread(),
74 GetCurrentProcess(), &watched_thread_handle_,
75 THREAD_QUERY_INFORMATION, FALSE, 0)) {
76 watched_thread_handle_ = nullptr;
77 }
78 #endif
79
80 #if defined(USE_X11) && !defined(OS_BSD)
81 tty_file_ = base::OpenFile(
82 base::FilePath(FILE_PATH_LITERAL("/sys/class/tty/tty0/active")), "r");
83 UpdateActiveTTY();
84 host_tty_ = active_tty_;
85 #endif
86
87 Arm();
88 }
89
~GpuWatchdogThread()90 GpuWatchdogThread::~GpuWatchdogThread() {
91 DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
92 // Stop() might take too long and the watchdog timeout is triggered.
93 // Disarm first before calling Stop() to avoid a crash.
94 if (IsArmed())
95 Disarm();
96 PauseWatchdog();
97
98 Stop(); // stop the watchdog thread
99
100 base::CurrentThread::Get()->RemoveTaskObserver(this);
101 base::PowerMonitor::RemoveObserver(this);
102 GpuWatchdogHistogram(GpuWatchdogThreadEvent::kGpuWatchdogEnd);
103 #if defined(OS_WIN)
104 if (watched_thread_handle_)
105 CloseHandle(watched_thread_handle_);
106 #endif
107
108 #if defined(USE_X11) && !defined(OS_BSD)
109 if (tty_file_)
110 fclose(tty_file_);
111 #endif
112 }
113
114 // static
Create(bool start_backgrounded,base::TimeDelta timeout,int init_factor,int restart_factor,int max_extra_cycles_before_kill,bool is_test_mode)115 std::unique_ptr<GpuWatchdogThread> GpuWatchdogThread::Create(
116 bool start_backgrounded,
117 base::TimeDelta timeout,
118 int init_factor,
119 int restart_factor,
120 int max_extra_cycles_before_kill,
121 bool is_test_mode) {
122 auto watchdog_thread = base::WrapUnique(
123 new GpuWatchdogThread(timeout, init_factor, restart_factor,
124 max_extra_cycles_before_kill, is_test_mode));
125 base::Thread::Options options;
126 options.timer_slack = base::TIMER_SLACK_MAXIMUM;
127 watchdog_thread->StartWithOptions(options);
128 if (start_backgrounded)
129 watchdog_thread->OnBackgrounded();
130 return watchdog_thread;
131 }
132
133 // static
Create(bool start_backgrounded)134 std::unique_ptr<GpuWatchdogThread> GpuWatchdogThread::Create(
135 bool start_backgrounded) {
136 base::TimeDelta gpu_watchdog_timeout = kGpuWatchdogTimeout;
137 int init_factor = kInitFactor;
138 int restart_factor = kRestartFactor;
139 int max_extra_cycles_before_kill = kMaxExtraCyclesBeforeKill;
140
141 #if defined(OS_WIN)
142 gpu_watchdog_timeout = GetGpuWatchdogTimeoutBasedOnCpuCores();
143 #endif
144
145 if (base::FeatureList::IsEnabled(features::kGpuWatchdogV2NewTimeout)) {
146 const char kNewTimeOutParam[] = "new_time_out";
147 const char kMaxExtraCyclesBeforeKillParam[] =
148 "max_extra_cycles_before_kill";
149
150 #if defined(OS_WIN)
151 constexpr int kFinchMaxExtraCyclesBeforeKill = 0;
152 #elif defined(OS_ANDROID)
153 constexpr int kFinchMaxExtraCyclesBeforeKill = 0;
154 init_factor = kInitFactorFinch;
155 restart_factor = kRestartFactorFinch;
156 #elif defined(OS_MAC)
157 constexpr int kFinchMaxExtraCyclesBeforeKill = 1;
158 #else
159 constexpr int kFinchMaxExtraCyclesBeforeKill = 2;
160 #endif
161
162 int timeout = base::GetFieldTrialParamByFeatureAsInt(
163 features::kGpuWatchdogV2NewTimeout, kNewTimeOutParam,
164 gpu_watchdog_timeout.InSeconds());
165 gpu_watchdog_timeout = base::TimeDelta::FromSeconds(timeout);
166
167 max_extra_cycles_before_kill = base::GetFieldTrialParamByFeatureAsInt(
168 features::kGpuWatchdogV2NewTimeout, kMaxExtraCyclesBeforeKillParam,
169 kFinchMaxExtraCyclesBeforeKill);
170 }
171
172 return Create(start_backgrounded, gpu_watchdog_timeout, init_factor,
173 restart_factor, max_extra_cycles_before_kill, false);
174 }
175
176 // Do not add power observer during watchdog init, PowerMonitor might not be up
177 // running yet.
AddPowerObserver()178 void GpuWatchdogThread::AddPowerObserver() {
179 DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
180
181 // Forward it to the watchdog thread. Call PowerMonitor::AddObserver on the
182 // watchdog thread so that OnSuspend and OnResume will be called on watchdog
183 // thread.
184 is_add_power_observer_called_ = true;
185 task_runner()->PostTask(FROM_HERE,
186 base::BindOnce(&GpuWatchdogThread::OnAddPowerObserver,
187 base::Unretained(this)));
188 }
189
190 // Android Chrome goes to the background. Called from the gpu thread.
OnBackgrounded()191 void GpuWatchdogThread::OnBackgrounded() {
192 task_runner()->PostTask(
193 FROM_HERE,
194 base::BindOnce(&GpuWatchdogThread::StopWatchdogTimeoutTask,
195 base::Unretained(this), kAndroidBackgroundForeground));
196 }
197
198 // Android Chrome goes to the foreground. Called from the gpu thread.
OnForegrounded()199 void GpuWatchdogThread::OnForegrounded() {
200 task_runner()->PostTask(
201 FROM_HERE,
202 base::BindOnce(&GpuWatchdogThread::RestartWatchdogTimeoutTask,
203 base::Unretained(this), kAndroidBackgroundForeground));
204 }
205
206 // Called from the gpu thread when gpu init has completed.
OnInitComplete()207 void GpuWatchdogThread::OnInitComplete() {
208 DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
209
210 task_runner()->PostTask(
211 FROM_HERE, base::BindOnce(&GpuWatchdogThread::UpdateInitializationFlag,
212 base::Unretained(this)));
213 Disarm();
214 }
215
216 // Called from the gpu thread in viz::GpuServiceImpl::~GpuServiceImpl().
217 // After this, no Disarm() will be called before the watchdog thread is
218 // destroyed. If this destruction takes too long, the watchdog timeout
219 // will be triggered.
OnGpuProcessTearDown()220 void GpuWatchdogThread::OnGpuProcessTearDown() {
221 DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
222
223 in_gpu_process_teardown_ = true;
224 if (!IsArmed())
225 Arm();
226 }
227
228 // Called from the gpu main thread.
PauseWatchdog()229 void GpuWatchdogThread::PauseWatchdog() {
230 DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
231
232 task_runner()->PostTask(
233 FROM_HERE, base::BindOnce(&GpuWatchdogThread::StopWatchdogTimeoutTask,
234 base::Unretained(this), kGeneralGpuFlow));
235 }
236
237 // Called from the gpu main thread.
ResumeWatchdog()238 void GpuWatchdogThread::ResumeWatchdog() {
239 DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
240
241 task_runner()->PostTask(
242 FROM_HERE, base::BindOnce(&GpuWatchdogThread::RestartWatchdogTimeoutTask,
243 base::Unretained(this), kGeneralGpuFlow));
244 }
245
246 // Running on the watchdog thread.
247 // On Linux, Init() will be called twice for Sandbox Initialization. The
248 // watchdog is stopped and then restarted in StartSandboxLinux(). Everything
249 // should be the same and continue after the second init().
Init()250 void GpuWatchdogThread::Init() {
251 watchdog_thread_task_runner_ = base::ThreadTaskRunnerHandle::Get();
252
253 // Get and Invalidate weak_ptr should be done on the watchdog thread only.
254 weak_ptr_ = weak_factory_.GetWeakPtr();
255 base::TimeDelta timeout = watchdog_timeout_ * kInitFactor;
256 task_runner()->PostDelayedTask(
257 FROM_HERE,
258 base::BindOnce(&GpuWatchdogThread::OnWatchdogTimeout, weak_ptr_),
259 timeout);
260
261 last_arm_disarm_counter_ = ReadArmDisarmCounter();
262 watchdog_start_timeticks_ = base::TimeTicks::Now();
263 last_on_watchdog_timeout_timeticks_ = watchdog_start_timeticks_;
264 next_on_watchdog_timeout_time_ = base::Time::Now() + timeout;
265
266 #if defined(OS_WIN)
267 if (watched_thread_handle_) {
268 if (base::ThreadTicks::IsSupported())
269 base::ThreadTicks::WaitUntilInitialized();
270 last_on_watchdog_timeout_thread_ticks_ = GetWatchedThreadTime();
271 remaining_watched_thread_ticks_ = timeout;
272 }
273 #endif
274 }
275
276 // Running on the watchdog thread.
CleanUp()277 void GpuWatchdogThread::CleanUp() {
278 DCHECK(watchdog_thread_task_runner_->BelongsToCurrentThread());
279 weak_factory_.InvalidateWeakPtrs();
280 }
281
ReportProgress()282 void GpuWatchdogThread::ReportProgress() {
283 DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
284 InProgress();
285 }
286
WillProcessTask(const base::PendingTask & pending_task,bool was_blocked_or_low_priority)287 void GpuWatchdogThread::WillProcessTask(const base::PendingTask& pending_task,
288 bool was_blocked_or_low_priority) {
289 DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
290
291 // The watchdog is armed at the beginning of the gpu process teardown.
292 // Do not call Arm() during teardown.
293 if (in_gpu_process_teardown_)
294 DCHECK(IsArmed());
295 else
296 Arm();
297 }
298
DidProcessTask(const base::PendingTask & pending_task)299 void GpuWatchdogThread::DidProcessTask(const base::PendingTask& pending_task) {
300 DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
301
302 // Keep the watchdog armed during tear down.
303 if (in_gpu_process_teardown_)
304 InProgress();
305 else
306 Disarm();
307 }
308
309 // Power Suspends. Running on the watchdog thread.
OnSuspend()310 void GpuWatchdogThread::OnSuspend() {
311 StopWatchdogTimeoutTask(kPowerSuspendResume);
312 }
313
314 // Power Resumes. Running on the watchdog thread.
OnResume()315 void GpuWatchdogThread::OnResume() {
316 RestartWatchdogTimeoutTask(kPowerSuspendResume);
317 }
318
319 // Running on the watchdog thread.
OnAddPowerObserver()320 void GpuWatchdogThread::OnAddPowerObserver() {
321 DCHECK(watchdog_thread_task_runner_->BelongsToCurrentThread());
322 DCHECK(base::PowerMonitor::IsInitialized());
323
324 base::PowerMonitor::AddObserver(this);
325 is_power_observer_added_ = true;
326 }
327
328 // Running on the watchdog thread.
RestartWatchdogTimeoutTask(PauseResumeSource source_of_request)329 void GpuWatchdogThread::RestartWatchdogTimeoutTask(
330 PauseResumeSource source_of_request) {
331 DCHECK(watchdog_thread_task_runner_->BelongsToCurrentThread());
332 base::TimeDelta timeout;
333
334 switch (source_of_request) {
335 case kAndroidBackgroundForeground:
336 if (!is_backgrounded_)
337 return;
338 is_backgrounded_ = false;
339 timeout = watchdog_timeout_ * watchdog_restart_factor_;
340 foregrounded_timeticks_ = base::TimeTicks::Now();
341 foregrounded_event_ = true;
342 num_of_timeout_after_foregrounded_ = 0;
343 break;
344 case kPowerSuspendResume:
345 if (!in_power_suspension_)
346 return;
347 in_power_suspension_ = false;
348 timeout = watchdog_timeout_ * watchdog_restart_factor_;
349 power_resume_timeticks_ = base::TimeTicks::Now();
350 power_resumed_event_ = true;
351 num_of_timeout_after_power_resume_ = 0;
352 break;
353 case kGeneralGpuFlow:
354 if (!is_paused_)
355 return;
356 is_paused_ = false;
357 timeout = watchdog_timeout_ * watchdog_init_factor_;
358 watchdog_resume_timeticks_ = base::TimeTicks::Now();
359 break;
360 }
361
362 if (!is_backgrounded_ && !in_power_suspension_ && !is_paused_) {
363 weak_ptr_ = weak_factory_.GetWeakPtr();
364 task_runner()->PostDelayedTask(
365 FROM_HERE,
366 base::BindOnce(&GpuWatchdogThread::OnWatchdogTimeout, weak_ptr_),
367 timeout);
368 last_on_watchdog_timeout_timeticks_ = base::TimeTicks::Now();
369 next_on_watchdog_timeout_time_ = base::Time::Now() + timeout;
370 last_arm_disarm_counter_ = ReadArmDisarmCounter();
371 #if defined(OS_WIN)
372 if (watched_thread_handle_) {
373 last_on_watchdog_timeout_thread_ticks_ = GetWatchedThreadTime();
374 remaining_watched_thread_ticks_ = timeout;
375 }
376 #endif
377 }
378 }
379
StopWatchdogTimeoutTask(PauseResumeSource source_of_request)380 void GpuWatchdogThread::StopWatchdogTimeoutTask(
381 PauseResumeSource source_of_request) {
382 DCHECK(watchdog_thread_task_runner_->BelongsToCurrentThread());
383
384 switch (source_of_request) {
385 case kAndroidBackgroundForeground:
386 if (is_backgrounded_)
387 return;
388 is_backgrounded_ = true;
389 backgrounded_timeticks_ = base::TimeTicks::Now();
390 foregrounded_event_ = false;
391 break;
392 case kPowerSuspendResume:
393 if (in_power_suspension_)
394 return;
395 in_power_suspension_ = true;
396 power_suspend_timeticks_ = base::TimeTicks::Now();
397 power_resumed_event_ = false;
398 break;
399 case kGeneralGpuFlow:
400 if (is_paused_)
401 return;
402 is_paused_ = true;
403 watchdog_pause_timeticks_ = base::TimeTicks::Now();
404 break;
405 }
406
407 // Revoke any pending watchdog timeout task
408 weak_factory_.InvalidateWeakPtrs();
409 }
410
UpdateInitializationFlag()411 void GpuWatchdogThread::UpdateInitializationFlag() {
412 in_gpu_initialization_ = false;
413 }
414
415 // Called from the gpu main thread.
416 // The watchdog is armed only in these three functions -
417 // GpuWatchdogThread(), WillProcessTask(), and OnGpuProcessTearDown()
Arm()418 void GpuWatchdogThread::Arm() {
419 DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
420
421 base::subtle::NoBarrier_AtomicIncrement(&arm_disarm_counter_, 1);
422
423 // Arm/Disarm are always called in sequence. Now it's an odd number.
424 DCHECK(IsArmed());
425 }
426
Disarm()427 void GpuWatchdogThread::Disarm() {
428 DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
429
430 base::subtle::NoBarrier_AtomicIncrement(&arm_disarm_counter_, 1);
431
432 // Arm/Disarm are always called in sequence. Now it's an even number.
433 DCHECK(!IsArmed());
434 }
435
InProgress()436 void GpuWatchdogThread::InProgress() {
437 DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
438
439 // Increment by 2. This is equivalent to Disarm() + Arm().
440 base::subtle::NoBarrier_AtomicIncrement(&arm_disarm_counter_, 2);
441
442 // Now it's an odd number.
443 DCHECK(IsArmed());
444 }
445
IsArmed()446 bool GpuWatchdogThread::IsArmed() {
447 // It's an odd number.
448 return base::subtle::NoBarrier_Load(&arm_disarm_counter_) & 1;
449 }
450
ReadArmDisarmCounter()451 base::subtle::Atomic32 GpuWatchdogThread::ReadArmDisarmCounter() {
452 return base::subtle::NoBarrier_Load(&arm_disarm_counter_);
453 }
454
455 // Running on the watchdog thread.
OnWatchdogTimeout()456 void GpuWatchdogThread::OnWatchdogTimeout() {
457 DCHECK(watchdog_thread_task_runner_->BelongsToCurrentThread());
458 DCHECK(!is_backgrounded_);
459 DCHECK(!in_power_suspension_);
460 DCHECK(!is_paused_);
461
462 // If this metric is added too early (eg. watchdog creation time), it cannot
463 // be persistent. The histogram data will be lost after crash or browser exit.
464 // Delay the recording of kGpuWatchdogStart until the firs
465 // OnWatchdogTimeout() to ensure this metric is created in the persistent
466 // memory.
467 if (!is_watchdog_start_histogram_recorded) {
468 is_watchdog_start_histogram_recorded = true;
469 GpuWatchdogHistogram(GpuWatchdogThreadEvent::kGpuWatchdogStart);
470 }
471
472 auto arm_disarm_counter = ReadArmDisarmCounter();
473 GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kTimeout);
474 if (power_resumed_event_)
475 num_of_timeout_after_power_resume_++;
476 if (foregrounded_event_)
477 num_of_timeout_after_foregrounded_++;
478
479 #if defined(USE_X11) && !defined(OS_BSD)
480 UpdateActiveTTY();
481 #endif
482
483 // Collect all needed info for gpu hang detection.
484 bool disarmed = arm_disarm_counter % 2 == 0; // even number
485 bool gpu_makes_progress = arm_disarm_counter != last_arm_disarm_counter_;
486 bool no_gpu_hang = disarmed || gpu_makes_progress || SlowWatchdogThread();
487
488 bool watched_thread_needs_more_time =
489 WatchedThreadNeedsMoreThreadTime(no_gpu_hang);
490 no_gpu_hang = no_gpu_hang || watched_thread_needs_more_time ||
491 ContinueOnNonHostX11ServerTty();
492
493 bool allows_extra_timeout = WatchedThreadGetsExtraTimeout(no_gpu_hang);
494 no_gpu_hang = no_gpu_hang || allows_extra_timeout;
495
496 // No gpu hang. Continue with another OnWatchdogTimeout task.
497 if (no_gpu_hang) {
498 last_on_watchdog_timeout_timeticks_ = base::TimeTicks::Now();
499 next_on_watchdog_timeout_time_ = base::Time::Now() + watchdog_timeout_;
500 last_arm_disarm_counter_ = ReadArmDisarmCounter();
501
502 task_runner()->PostDelayedTask(
503 FROM_HERE,
504 base::BindOnce(&GpuWatchdogThread::OnWatchdogTimeout, weak_ptr_),
505 watchdog_timeout_);
506 return;
507 }
508
509 // Still armed without any progress. GPU possibly hangs.
510 GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kKill);
511 #if defined(OS_WIN)
512 if (less_than_full_thread_time_after_capped_)
513 GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kKillOnLessThreadTime);
514 #endif
515
516 DeliberatelyTerminateToRecoverFromHang();
517 }
518
SlowWatchdogThread()519 bool GpuWatchdogThread::SlowWatchdogThread() {
520 // If it takes 15 more seconds than the expected time between two
521 // OnWatchdogTimeout() calls, the system is considered slow and it's not a GPU
522 // hang.
523 bool slow_watchdog_thread =
524 (base::Time::Now() - next_on_watchdog_timeout_time_) >=
525 base::TimeDelta::FromSeconds(15);
526
527 // Record this case only when a GPU hang is detected and the thread is slow.
528 if (slow_watchdog_thread)
529 GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kSlowWatchdogThread);
530
531 return slow_watchdog_thread;
532 }
533
WatchedThreadNeedsMoreThreadTime(bool no_gpu_hang_detected)534 bool GpuWatchdogThread::WatchedThreadNeedsMoreThreadTime(
535 bool no_gpu_hang_detected) {
536 #if defined(OS_WIN)
537 if (!watched_thread_handle_)
538 return false;
539
540 // We allow extra thread time. When that runs out, we extend extra timeout
541 // cycles. Now, we are extending extra timeout cycles. Don't add extra thread
542 // time.
543 if (count_of_extra_cycles_ > 0)
544 return false;
545
546 WatchedThreadNeedsMoreThreadTimeHistogram(
547 no_gpu_hang_detected,
548 /*start_of_more_thread_time*/ false);
549
550 if (!no_gpu_hang_detected && count_of_more_gpu_thread_time_allowed_ >=
551 kMaxCountOfMoreGpuThreadTimeAllowed) {
552 less_than_full_thread_time_after_capped_ = true;
553 } else {
554 less_than_full_thread_time_after_capped_ = false;
555 }
556
557 // Calculate how many thread ticks the watched thread spent doing the work.
558 base::ThreadTicks now = GetWatchedThreadTime();
559 base::TimeDelta thread_time_elapsed =
560 now - last_on_watchdog_timeout_thread_ticks_;
561 last_on_watchdog_timeout_thread_ticks_ = now;
562 remaining_watched_thread_ticks_ -= thread_time_elapsed;
563
564 if (no_gpu_hang_detected ||
565 count_of_more_gpu_thread_time_allowed_ >=
566 kMaxCountOfMoreGpuThreadTimeAllowed ||
567 thread_time_elapsed < base::TimeDelta() /* bogus data */ ||
568 remaining_watched_thread_ticks_ <= base::TimeDelta()) {
569 // Reset the remaining thread ticks.
570 remaining_watched_thread_ticks_ = watchdog_timeout_;
571 count_of_more_gpu_thread_time_allowed_ = 0;
572
573 return false;
574 } else {
575 // This is the start of allowing more thread time.
576 if (count_of_more_gpu_thread_time_allowed_ == 0) {
577 WatchedThreadNeedsMoreThreadTimeHistogram(
578 no_gpu_hang_detected, /*start_of_more_thread_time*/ true);
579 }
580 count_of_more_gpu_thread_time_allowed_++;
581
582 return true;
583 }
584 #else
585 return false;
586 #endif
587 }
588
589 #if defined(OS_WIN)
GetWatchedThreadTime()590 base::ThreadTicks GpuWatchdogThread::GetWatchedThreadTime() {
591 DCHECK(watched_thread_handle_);
592
593 if (base::ThreadTicks::IsSupported()) {
594 // Note: GetForThread() might return bogus results if running on different
595 // CPUs between two calls.
596 return base::ThreadTicks::GetForThread(
597 base::PlatformThreadHandle(watched_thread_handle_));
598 } else {
599 FILETIME creation_time;
600 FILETIME exit_time;
601 FILETIME kernel_time;
602 FILETIME user_time;
603 BOOL result = GetThreadTimes(watched_thread_handle_, &creation_time,
604 &exit_time, &kernel_time, &user_time);
605 if (!result)
606 return base::ThreadTicks();
607
608 // Need to bit_cast to fix alignment, then divide by 10 to convert
609 // 100-nanoseconds to microseconds.
610 int64_t user_time_us = bit_cast<int64_t, FILETIME>(user_time) / 10;
611 int64_t kernel_time_us = bit_cast<int64_t, FILETIME>(kernel_time) / 10;
612
613 return base::ThreadTicks() +
614 base::TimeDelta::FromMicroseconds(user_time_us + kernel_time_us);
615 }
616 }
617 #endif
618
WatchedThreadGetsExtraTimeout(bool no_gpu_hang)619 bool GpuWatchdogThread::WatchedThreadGetsExtraTimeout(bool no_gpu_hang) {
620 if (max_extra_cycles_before_kill_ == 0)
621 return false;
622
623 // We want to record histograms even if there is no gpu hang.
624 bool allows_more_timeouts = false;
625 WatchedThreadGetsExtraTimeoutHistogram(no_gpu_hang);
626
627 if (no_gpu_hang) {
628 if (count_of_extra_cycles_ > 0) {
629 count_of_extra_cycles_ = 0;
630 }
631 } else if (count_of_extra_cycles_ < max_extra_cycles_before_kill_) {
632 count_of_extra_cycles_++;
633 allows_more_timeouts = true;
634 }
635
636 return allows_more_timeouts;
637 }
638
DeliberatelyTerminateToRecoverFromHang()639 void GpuWatchdogThread::DeliberatelyTerminateToRecoverFromHang() {
640 DCHECK(watchdog_thread_task_runner_->BelongsToCurrentThread());
641 // If this is for gpu testing, do not terminate the gpu process.
642 if (is_test_mode_) {
643 test_result_timeout_and_gpu_hang_.Set();
644 return;
645 }
646
647 #if defined(OS_WIN)
648 if (IsDebuggerPresent())
649 return;
650 #endif
651
652 // Store variables so they're available in crash dumps to help determine the
653 // cause of any hang.
654 base::TimeTicks function_begin_timeticks = base::TimeTicks::Now();
655 base::debug::Alias(&in_gpu_initialization_);
656 base::debug::Alias(&num_of_timeout_after_power_resume_);
657 base::debug::Alias(&num_of_timeout_after_foregrounded_);
658 base::debug::Alias(&function_begin_timeticks);
659 base::debug::Alias(&watchdog_start_timeticks_);
660 base::debug::Alias(&power_suspend_timeticks_);
661 base::debug::Alias(&power_resume_timeticks_);
662 base::debug::Alias(&backgrounded_timeticks_);
663 base::debug::Alias(&foregrounded_timeticks_);
664 base::debug::Alias(&watchdog_pause_timeticks_);
665 base::debug::Alias(&watchdog_resume_timeticks_);
666 base::debug::Alias(&in_power_suspension_);
667 base::debug::Alias(&in_gpu_process_teardown_);
668 base::debug::Alias(&is_backgrounded_);
669 base::debug::Alias(&is_add_power_observer_called_);
670 base::debug::Alias(&is_power_observer_added_);
671 base::debug::Alias(&last_on_watchdog_timeout_timeticks_);
672 base::TimeDelta timeticks_elapses =
673 function_begin_timeticks - last_on_watchdog_timeout_timeticks_;
674 base::debug::Alias(&timeticks_elapses);
675 base::debug::Alias(&max_extra_cycles_before_kill_);
676 #if defined(OS_WIN)
677 base::debug::Alias(&remaining_watched_thread_ticks_);
678 base::debug::Alias(&less_than_full_thread_time_after_capped_);
679 #endif
680
681 GpuWatchdogHistogram(GpuWatchdogThreadEvent::kGpuWatchdogKill);
682
683 crash_keys::gpu_watchdog_crashed_in_gpu_init.Set(
684 in_gpu_initialization_ ? "1" : "0");
685
686 crash_keys::gpu_watchdog_kill_after_power_resume.Set(
687 WithinOneMinFromPowerResumed() ? "1" : "0");
688
689 crash_keys::num_of_processors.Set(base::NumberToString(num_of_processors_));
690
691 // Check the arm_disarm_counter value one more time.
692 auto last_arm_disarm_counter = ReadArmDisarmCounter();
693 base::debug::Alias(&last_arm_disarm_counter);
694
695 // Use RESULT_CODE_HUNG so this crash is separated from other
696 // EXCEPTION_ACCESS_VIOLATION buckets for UMA analysis.
697 // Create a crash dump first. TerminateCurrentProcessImmediately will not
698 // create a dump.
699 base::debug::DumpWithoutCrashing();
700 base::Process::TerminateCurrentProcessImmediately(RESULT_CODE_HUNG);
701 }
702
GpuWatchdogHistogram(GpuWatchdogThreadEvent thread_event)703 void GpuWatchdogThread::GpuWatchdogHistogram(
704 GpuWatchdogThreadEvent thread_event) {
705 base::UmaHistogramEnumeration("GPU.WatchdogThread.Event", thread_event);
706 }
707
GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent timeout_event)708 void GpuWatchdogThread::GpuWatchdogTimeoutHistogram(
709 GpuWatchdogTimeoutEvent timeout_event) {
710 base::UmaHistogramEnumeration("GPU.WatchdogThread.Timeout", timeout_event);
711
712 bool recorded = false;
713 if (in_gpu_initialization_) {
714 base::UmaHistogramEnumeration("GPU.WatchdogThread.Timeout.Init",
715 timeout_event);
716 recorded = true;
717 }
718
719 if (WithinOneMinFromPowerResumed()) {
720 base::UmaHistogramEnumeration("GPU.WatchdogThread.Timeout.PowerResume",
721 timeout_event);
722 recorded = true;
723 }
724
725 if (WithinOneMinFromForegrounded()) {
726 base::UmaHistogramEnumeration("GPU.WatchdogThread.Timeout.Foregrounded",
727 timeout_event);
728 recorded = true;
729 }
730
731 if (!recorded) {
732 base::UmaHistogramEnumeration("GPU.WatchdogThread.Timeout.Normal",
733 timeout_event);
734 }
735 }
736
737 #if defined(OS_WIN)
RecordExtraThreadTimeHistogram()738 void GpuWatchdogThread::RecordExtraThreadTimeHistogram() {
739 // Record the number of timeouts the GPU main thread needs to make a progress
740 // after GPU OnWatchdogTimeout() is triggered. The maximum count is 6 which
741 // is more than kMaxCountOfMoreGpuThreadTimeAllowed(4);
742 constexpr int kMin = 1;
743 constexpr int kMax = 6;
744 constexpr int kBuckets = 6;
745 int count = count_of_more_gpu_thread_time_allowed_;
746 bool recorded = false;
747
748 base::UmaHistogramCustomCounts("GPU.WatchdogThread.ExtraThreadTime", count,
749 kMin, kMax, kBuckets);
750
751 if (in_gpu_initialization_) {
752 base::UmaHistogramCustomCounts("GPU.WatchdogThread.ExtraThreadTime.Init",
753 count, kMin, kMax, kBuckets);
754 recorded = true;
755 }
756
757 if (WithinOneMinFromPowerResumed()) {
758 base::UmaHistogramCustomCounts(
759 "GPU.WatchdogThread.ExtraThreadTime.PowerResume", count, kMin, kMax,
760 kBuckets);
761 recorded = true;
762 }
763
764 if (WithinOneMinFromForegrounded()) {
765 base::UmaHistogramCustomCounts(
766 "GPU.WatchdogThread.ExtraThreadTime.Foregrounded", count, kMin, kMax,
767 kBuckets);
768 recorded = true;
769 }
770
771 if (!recorded) {
772 base::UmaHistogramCustomCounts("GPU.WatchdogThread.ExtraThreadTime.Normal",
773 count, kMin, kMax, kBuckets);
774 }
775 }
776
RecordNumOfUsersWaitingWithExtraThreadTimeHistogram(int count)777 void GpuWatchdogThread::RecordNumOfUsersWaitingWithExtraThreadTimeHistogram(
778 int count) {
779 constexpr int kMax = 4;
780
781 base::UmaHistogramExactLinear("GPU.WatchdogThread.ExtraThreadTime.NumOfUsers",
782 count, kMax);
783 }
784
WatchedThreadNeedsMoreThreadTimeHistogram(bool no_gpu_hang_detected,bool start_of_more_thread_time)785 void GpuWatchdogThread::WatchedThreadNeedsMoreThreadTimeHistogram(
786 bool no_gpu_hang_detected,
787 bool start_of_more_thread_time) {
788 if (start_of_more_thread_time) {
789 // This is the start of allowing more thread time. Only record it once for
790 // all following timeouts on the same detected gpu hang, so we know this
791 // is equivlent one crash in our crash reports.
792 GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kMoreThreadTime);
793 RecordNumOfUsersWaitingWithExtraThreadTimeHistogram(0);
794 } else {
795 if (count_of_more_gpu_thread_time_allowed_ > 0) {
796 if (no_gpu_hang_detected) {
797 // If count_of_more_gpu_thread_time_allowed_ > 0, we know extra time was
798 // extended in the previous OnWatchdogTimeout(). Now we find gpu makes
799 // progress. Record this case.
800 GpuWatchdogTimeoutHistogram(
801 GpuWatchdogTimeoutEvent::kProgressAfterMoreThreadTime);
802 RecordExtraThreadTimeHistogram();
803 } else {
804 if (count_of_more_gpu_thread_time_allowed_ >=
805 kMaxCountOfMoreGpuThreadTimeAllowed) {
806 GpuWatchdogTimeoutHistogram(
807 GpuWatchdogTimeoutEvent::kLessThanFullThreadTimeAfterCapped);
808 }
809 }
810
811 // Records the number of users who are still waiting. We can use this
812 // number to calculate the number of users who had already quit.
813 RecordNumOfUsersWaitingWithExtraThreadTimeHistogram(
814 count_of_more_gpu_thread_time_allowed_);
815
816 // Used by GPU.WatchdogThread.WaitTime later
817 time_in_wait_for_full_thread_time_ =
818 count_of_more_gpu_thread_time_allowed_ * watchdog_timeout_;
819 }
820 }
821 }
822 #endif
823
WatchedThreadGetsExtraTimeoutHistogram(bool no_gpu_hang)824 void GpuWatchdogThread::WatchedThreadGetsExtraTimeoutHistogram(
825 bool no_gpu_hang) {
826 constexpr int kMax = 60;
827 if (count_of_extra_cycles_ == 0 && !no_gpu_hang) {
828 GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kTimeoutWait);
829 base::UmaHistogramExactLinear("GPU.WatchdogThread.WaitTime.NumOfUsers", 0,
830 kMax);
831 } else if (count_of_extra_cycles_ > 0) {
832 int count = watchdog_timeout_.InSeconds() * count_of_extra_cycles_;
833 base::UmaHistogramExactLinear("GPU.WatchdogThread.WaitTime.NumOfUsers",
834 count, kMax);
835 if (no_gpu_hang) {
836 GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kProgressAfterWait);
837 base::UmaHistogramExactLinear(
838 "GPU.WatchdogThread.WaitTime.ProgressAfterWait", count, kMax);
839
840 #if defined(OS_WIN)
841 // Add the time the GPU thread was given for the full thread time up to 60
842 // seconds. GPU.WatchdogThread.WaitTime is essentially equal to
843 // GPU.WatchdogThread.WaitTime.ProgressAfterWait on non-Windows systems.
844 base::TimeDelta wait_time = base::TimeDelta::FromSeconds(count);
845 wait_time += time_in_wait_for_full_thread_time_;
846
847 constexpr base::TimeDelta kMinTime = base::TimeDelta::FromSeconds(1);
848 constexpr base::TimeDelta kMaxTime = base::TimeDelta::FromSeconds(150);
849 constexpr int kBuckets = 50;
850
851 // The time the GPU main thread takes to finish a task after a "hang" is
852 // dectedted.
853 base::UmaHistogramCustomTimes("GPU.WatchdogThread.WaitTime", wait_time,
854 kMinTime, kMaxTime, kBuckets);
855 #endif
856 }
857 }
858 }
859
WithinOneMinFromPowerResumed()860 bool GpuWatchdogThread::WithinOneMinFromPowerResumed() {
861 size_t count = base::ClampFloor<size_t>(base::TimeDelta::FromMinutes(1) /
862 watchdog_timeout_);
863 return power_resumed_event_ && num_of_timeout_after_power_resume_ <= count;
864 }
865
WithinOneMinFromForegrounded()866 bool GpuWatchdogThread::WithinOneMinFromForegrounded() {
867 size_t count = base::ClampFloor<size_t>(base::TimeDelta::FromMinutes(1) /
868 watchdog_timeout_);
869 return foregrounded_event_ && num_of_timeout_after_foregrounded_ <= count;
870 }
871
872 #if defined(USE_X11) && !defined(OS_BSD)
UpdateActiveTTY()873 void GpuWatchdogThread::UpdateActiveTTY() {
874 last_active_tty_ = active_tty_;
875
876 active_tty_ = -1;
877 char tty_string[8] = {0};
878 if (tty_file_ && !fseek(tty_file_, 0, SEEK_SET) &&
879 fread(tty_string, 1, 7, tty_file_)) {
880 int tty_number;
881 if (sscanf(tty_string, "tty%d\n", &tty_number) == 1) {
882 active_tty_ = tty_number;
883 }
884 }
885 }
886 #endif
887
ContinueOnNonHostX11ServerTty()888 bool GpuWatchdogThread::ContinueOnNonHostX11ServerTty() {
889 #if defined(USE_X11) && !defined(OS_BSD)
890 if (host_tty_ == -1 || active_tty_ == -1)
891 return false;
892
893 // Don't crash if we're not on the TTY of our host X11 server.
894 if (active_tty_ != host_tty_) {
895 // Only record for the time there is a change on TTY
896 if (last_active_tty_ == active_tty_) {
897 GpuWatchdogTimeoutHistogram(
898 GpuWatchdogTimeoutEvent::kContinueOnNonHostServerTty);
899 }
900 return true;
901 }
902 #endif
903 return false;
904 }
905
906 // For gpu testing only. Return whether a GPU hang was detected or not.
IsGpuHangDetectedForTesting()907 bool GpuWatchdogThread::IsGpuHangDetectedForTesting() {
908 DCHECK(is_test_mode_);
909 return test_result_timeout_and_gpu_hang_.IsSet();
910 }
911
912 // This should be called on the test main thread only. It will wait until the
913 // power observer is added on the watchdog thread.
WaitForPowerObserverAddedForTesting()914 void GpuWatchdogThread::WaitForPowerObserverAddedForTesting() {
915 DCHECK(watched_gpu_task_runner_->BelongsToCurrentThread());
916 DCHECK(is_add_power_observer_called_);
917
918 // Just return if it has been added.
919 if (is_power_observer_added_)
920 return;
921
922 base::WaitableEvent event;
923 task_runner()->PostTask(
924 FROM_HERE,
925 base::BindOnce(&base::WaitableEvent::Signal, base::Unretained(&event)));
926 event.Wait();
927 }
928
929 } // namespace gpu
930