1 // Copyright 2018 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "content/browser/speech/tts_controller_impl.h"
6
7 #include <stddef.h>
8
9 #include <algorithm>
10 #include <string>
11 #include <vector>
12
13 #include "base/bind.h"
14 #include "base/containers/queue.h"
15 #include "base/json/json_reader.h"
16 #include "base/metrics/histogram_macros.h"
17 #include "base/metrics/user_metrics.h"
18 #include "base/values.h"
19 #include "build/build_config.h"
20 #include "content/browser/speech/tts_utterance_impl.h"
21 #include "content/public/browser/content_browser_client.h"
22 #include "content/public/browser/visibility.h"
23 #include "content/public/browser/web_contents.h"
24 #include "content/public/common/content_client.h"
25 #include "services/data_decoder/public/cpp/safe_xml_parser.h"
26 #include "services/data_decoder/public/mojom/xml_parser.mojom.h"
27 #include "third_party/blink/public/mojom/speech/speech_synthesis.mojom.h"
28 #include "ui/base/l10n/l10n_util.h"
29
30 #if defined(OS_CHROMEOS)
31 #include "content/public/browser/tts_controller_delegate.h"
32 #endif
33
34 namespace content {
35 namespace {
// Sentinel handed to OnTtsEvent() when an event has no character index.
constexpr int kInvalidCharIndex = -1;

// Sentinel handed to OnTtsEvent() when an event has no length.
constexpr int kInvalidLength = -1;
41
42 #if defined(OS_CHROMEOS)
VoiceIdMatches(const base::Optional<TtsControllerDelegate::PreferredVoiceId> & id,const content::VoiceData & voice)43 bool VoiceIdMatches(
44 const base::Optional<TtsControllerDelegate::PreferredVoiceId>& id,
45 const content::VoiceData& voice) {
46 if (!id.has_value() || voice.name.empty() ||
47 (voice.engine_id.empty() && !voice.native))
48 return false;
49 if (voice.native)
50 return id->name == voice.name && id->id.empty();
51 return id->name == voice.name && id->id == voice.engine_id;
52 }
53 #endif // defined(OS_CHROMEOS)
54
AsUtteranceImpl(TtsUtterance * utterance)55 TtsUtteranceImpl* AsUtteranceImpl(TtsUtterance* utterance) {
56 return static_cast<TtsUtteranceImpl*>(utterance);
57 }
58
59 } // namespace
60
61 //
62 // VoiceData
63 //
64
VoiceData()65 VoiceData::VoiceData() : remote(false), native(false) {}
66
67 VoiceData::VoiceData(const VoiceData& other) = default;
68
~VoiceData()69 VoiceData::~VoiceData() {}
70
71 //
72 // TtsController
73 //
74
GetInstance()75 TtsController* TtsController::GetInstance() {
76 return TtsControllerImpl::GetInstance();
77 }
78
// Buckets recorded in the "TextToSpeech.Event" histogram; each value mirrors
// one TtsEventType handled in TtsControllerImpl::OnTtsEvent().
//
// IMPORTANT!
// These values are written to logs.  Do not renumber or delete
// existing items; add new entries to the end of the list.
enum class UMATextToSpeechEvent {
  START = 0,
  END = 1,
  WORD = 2,
  SENTENCE = 3,
  MARKER = 4,
  INTERRUPTED = 5,
  CANCELLED = 6,
  SPEECH_ERROR = 7,
  PAUSE = 8,
  RESUME = 9,

  // This must always be the last enum. It's okay for its value to
  // increase, but none of the other enum values may change.
  COUNT
};
98
99 //
100 // TtsControllerImpl
101 //
102
103 // static
GetInstance()104 TtsControllerImpl* TtsControllerImpl::GetInstance() {
105 return base::Singleton<TtsControllerImpl>::get();
106 }
107
SetStopSpeakingWhenHidden(bool value)108 void TtsControllerImpl::SetStopSpeakingWhenHidden(bool value) {
109 stop_speaking_when_hidden_ = value;
110 }
111
112 TtsControllerImpl::TtsControllerImpl() = default;
113
~TtsControllerImpl()114 TtsControllerImpl::~TtsControllerImpl() {
115 if (current_utterance_) {
116 current_utterance_->Finish();
117 SetCurrentUtterance(nullptr);
118 }
119
120 // Clear any queued utterances too.
121 ClearUtteranceQueue(false); // Don't sent events.
122 }
123
SpeakOrEnqueue(std::unique_ptr<TtsUtterance> utterance)124 void TtsControllerImpl::SpeakOrEnqueue(
125 std::unique_ptr<TtsUtterance> utterance) {
126 if (!ShouldSpeakUtterance(utterance.get())) {
127 utterance->Finish();
128 return;
129 }
130
131 // If the TTS platform is still loading, queue or flush the utterance. The
132 // utterances can be sent to platform specific implementation or to the
133 // engine implementation. Every utterances are postponed until the platform
134 // specific implementation is loaded to avoid racy behaviors.
135 if (TtsPlatformLoading()) {
136 bool can_enqueue = utterance->GetCanEnqueue();
137 utterance_list_.emplace_back(std::move(utterance));
138 if (!can_enqueue)
139 ClearUtteranceQueue(true);
140 return;
141 }
142
143 // If we're paused and we get an utterance that can't be queued,
144 // flush the queue but stay in the paused state.
145 if (paused_ && !utterance->GetCanEnqueue()) {
146 utterance_list_.emplace_back(std::move(utterance));
147 Stop();
148 paused_ = true;
149 return;
150 }
151
152 if (paused_ || (IsSpeaking() && utterance->GetCanEnqueue())) {
153 utterance_list_.emplace_back(std::move(utterance));
154 } else {
155 Stop();
156 SpeakNow(std::move(utterance));
157 }
158 }
159
Stop()160 void TtsControllerImpl::Stop() {
161 StopAndClearQueue(GURL());
162 }
163
Stop(const GURL & source_url)164 void TtsControllerImpl::Stop(const GURL& source_url) {
165 StopAndClearQueue(source_url);
166 }
167
StopAndClearQueue(const GURL & source_url)168 void TtsControllerImpl::StopAndClearQueue(const GURL& source_url) {
169 if (StopCurrentUtteranceIfMatches(source_url))
170 ClearUtteranceQueue(true);
171 }
172
StopCurrentUtteranceIfMatches(const GURL & source_url)173 bool TtsControllerImpl::StopCurrentUtteranceIfMatches(const GURL& source_url) {
174 base::RecordAction(base::UserMetricsAction("TextToSpeech.Stop"));
175
176 paused_ = false;
177
178 if (!source_url.is_empty() && current_utterance_ &&
179 current_utterance_->GetSrcUrl().GetOrigin() != source_url.GetOrigin())
180 return false;
181
182 if (current_utterance_ && !current_utterance_->GetEngineId().empty()) {
183 if (engine_delegate_)
184 engine_delegate_->Stop(current_utterance_.get());
185 } else if (TtsPlatformReady()) {
186 GetTtsPlatform()->ClearError();
187 GetTtsPlatform()->StopSpeaking();
188 }
189
190 if (current_utterance_)
191 current_utterance_->OnTtsEvent(TTS_EVENT_INTERRUPTED, kInvalidCharIndex,
192 kInvalidLength, std::string());
193 FinishCurrentUtterance();
194 return true;
195 }
196
Pause()197 void TtsControllerImpl::Pause() {
198 base::RecordAction(base::UserMetricsAction("TextToSpeech.Pause"));
199 if (paused_)
200 return;
201
202 paused_ = true;
203 if (current_utterance_ && !current_utterance_->GetEngineId().empty()) {
204 if (engine_delegate_)
205 engine_delegate_->Pause(current_utterance_.get());
206 } else if (current_utterance_) {
207 DCHECK(TtsPlatformReady());
208 GetTtsPlatform()->ClearError();
209 GetTtsPlatform()->Pause();
210 }
211 }
212
Resume()213 void TtsControllerImpl::Resume() {
214 base::RecordAction(base::UserMetricsAction("TextToSpeech.Resume"));
215 if (!paused_)
216 return;
217
218 paused_ = false;
219 if (current_utterance_ && !current_utterance_->GetEngineId().empty()) {
220 if (engine_delegate_)
221 engine_delegate_->Resume(current_utterance_.get());
222 } else if (current_utterance_) {
223 DCHECK(TtsPlatformReady());
224 GetTtsPlatform()->ClearError();
225 GetTtsPlatform()->Resume();
226 } else {
227 SpeakNextUtterance();
228 }
229 }
230
OnTtsEvent(int utterance_id,TtsEventType event_type,int char_index,int length,const std::string & error_message)231 void TtsControllerImpl::OnTtsEvent(int utterance_id,
232 TtsEventType event_type,
233 int char_index,
234 int length,
235 const std::string& error_message) {
236 // We may sometimes receive completion callbacks "late", after we've
237 // already finished the utterance (for example because another utterance
238 // interrupted or we got a call to Stop). This is normal and we can
239 // safely just ignore these events.
240 if (!current_utterance_ || utterance_id != current_utterance_->GetId()) {
241 return;
242 }
243
244 UMATextToSpeechEvent metric;
245 switch (event_type) {
246 case TTS_EVENT_START:
247 metric = UMATextToSpeechEvent::START;
248 break;
249 case TTS_EVENT_END:
250 metric = UMATextToSpeechEvent::END;
251 break;
252 case TTS_EVENT_WORD:
253 metric = UMATextToSpeechEvent::WORD;
254 break;
255 case TTS_EVENT_SENTENCE:
256 metric = UMATextToSpeechEvent::SENTENCE;
257 break;
258 case TTS_EVENT_MARKER:
259 metric = UMATextToSpeechEvent::MARKER;
260 break;
261 case TTS_EVENT_INTERRUPTED:
262 metric = UMATextToSpeechEvent::INTERRUPTED;
263 break;
264 case TTS_EVENT_CANCELLED:
265 metric = UMATextToSpeechEvent::CANCELLED;
266 break;
267 case TTS_EVENT_ERROR:
268 metric = UMATextToSpeechEvent::SPEECH_ERROR;
269 break;
270 case TTS_EVENT_PAUSE:
271 metric = UMATextToSpeechEvent::PAUSE;
272 break;
273 case TTS_EVENT_RESUME:
274 metric = UMATextToSpeechEvent::RESUME;
275 break;
276 default:
277 NOTREACHED();
278 return;
279 }
280 UMA_HISTOGRAM_ENUMERATION("TextToSpeech.Event", metric,
281 UMATextToSpeechEvent::COUNT);
282
283 current_utterance_->OnTtsEvent(event_type, char_index, length, error_message);
284 if (current_utterance_->IsFinished()) {
285 FinishCurrentUtterance();
286 SpeakNextUtterance();
287 }
288 }
289
GetVoices(BrowserContext * browser_context,std::vector<VoiceData> * out_voices)290 void TtsControllerImpl::GetVoices(BrowserContext* browser_context,
291 std::vector<VoiceData>* out_voices) {
292 TtsPlatform* tts_platform = GetTtsPlatform();
293 DCHECK(tts_platform);
294 // Ensure we have all built-in voices loaded. This is a no-op if already
295 // loaded.
296 tts_platform->LoadBuiltInTtsEngine(browser_context);
297 if (TtsPlatformReady())
298 tts_platform->GetVoices(out_voices);
299
300 if (browser_context && engine_delegate_)
301 engine_delegate_->GetVoices(browser_context, out_voices);
302 }
303
IsSpeaking()304 bool TtsControllerImpl::IsSpeaking() {
305 return current_utterance_ != nullptr ||
306 (TtsPlatformReady() && GetTtsPlatform()->IsSpeaking());
307 }
308
VoicesChanged()309 void TtsControllerImpl::VoicesChanged() {
310 if (!voices_changed_delegates_.might_have_observers() || TtsPlatformLoading())
311 return;
312
313 // Existence of platform tts indicates explicit requests to tts. Since
314 // |VoicesChanged| can occur implicitly, only send if needed.
315 for (auto& delegate : voices_changed_delegates_)
316 delegate.OnVoicesChanged();
317
318 if (!current_utterance_ && !utterance_list_.empty())
319 SpeakNextUtterance();
320 }
321
AddVoicesChangedDelegate(VoicesChangedDelegate * delegate)322 void TtsControllerImpl::AddVoicesChangedDelegate(
323 VoicesChangedDelegate* delegate) {
324 voices_changed_delegates_.AddObserver(delegate);
325 }
326
RemoveVoicesChangedDelegate(VoicesChangedDelegate * delegate)327 void TtsControllerImpl::RemoveVoicesChangedDelegate(
328 VoicesChangedDelegate* delegate) {
329 voices_changed_delegates_.RemoveObserver(delegate);
330 }
331
RemoveUtteranceEventDelegate(UtteranceEventDelegate * delegate)332 void TtsControllerImpl::RemoveUtteranceEventDelegate(
333 UtteranceEventDelegate* delegate) {
334 // First clear any pending utterances with this delegate.
335 std::list<std::unique_ptr<TtsUtterance>> old_list;
336 utterance_list_.swap(old_list);
337 while (!old_list.empty()) {
338 std::unique_ptr<TtsUtterance> utterance = std::move(old_list.front());
339 old_list.pop_front();
340 if (utterance->GetEventDelegate() != delegate)
341 utterance_list_.emplace_back(std::move(utterance));
342 }
343
344 if (current_utterance_ &&
345 current_utterance_->GetEventDelegate() == delegate) {
346 current_utterance_->SetEventDelegate(nullptr);
347 if (!current_utterance_->GetEngineId().empty()) {
348 if (engine_delegate_)
349 engine_delegate_->Stop(current_utterance_.get());
350 } else {
351 DCHECK(TtsPlatformReady());
352 GetTtsPlatform()->ClearError();
353 GetTtsPlatform()->StopSpeaking();
354 }
355
356 FinishCurrentUtterance();
357 SpeakNextUtterance();
358 }
359 }
360
SetTtsEngineDelegate(TtsEngineDelegate * delegate)361 void TtsControllerImpl::SetTtsEngineDelegate(TtsEngineDelegate* delegate) {
362 engine_delegate_ = delegate;
363 }
364
GetTtsEngineDelegate()365 TtsEngineDelegate* TtsControllerImpl::GetTtsEngineDelegate() {
366 return engine_delegate_;
367 }
368
Shutdown()369 void TtsControllerImpl::Shutdown() {
370 if (tts_platform_)
371 tts_platform_->Shutdown();
372 }
373
OnBrowserContextDestroyed(BrowserContext * browser_context)374 void TtsControllerImpl::OnBrowserContextDestroyed(
375 BrowserContext* browser_context) {
376 bool did_clear_utterances = false;
377
378 // First clear the BrowserContext from any utterances.
379 for (std::unique_ptr<TtsUtterance>& utterance : utterance_list_) {
380 if (utterance->GetBrowserContext() == browser_context) {
381 utterance->ClearBrowserContext();
382 did_clear_utterances = true;
383 }
384 }
385
386 if (current_utterance_ &&
387 current_utterance_->GetBrowserContext() == browser_context) {
388 current_utterance_->ClearBrowserContext();
389 did_clear_utterances = true;
390 }
391
392 // If we cleared the BrowserContext from any utterances, stop speech
393 // just to be safe. Do this using PostTask because calling Stop might
394 // try to send notifications and that can trigger code paths that try
395 // to access the BrowserContext that's being deleted. Note that it's
396 // safe to use base::Unretained because this is a singleton.
397 if (did_clear_utterances) {
398 base::ThreadTaskRunnerHandle::Get()->PostTask(
399 FROM_HERE, base::BindOnce(&TtsControllerImpl::StopAndClearQueue,
400 base::Unretained(this), GURL()));
401 }
402 }
403
SetTtsPlatform(TtsPlatform * tts_platform)404 void TtsControllerImpl::SetTtsPlatform(TtsPlatform* tts_platform) {
405 tts_platform_ = tts_platform;
406 }
407
QueueSize()408 int TtsControllerImpl::QueueSize() {
409 return static_cast<int>(utterance_list_.size());
410 }
411
GetTtsPlatform()412 TtsPlatform* TtsControllerImpl::GetTtsPlatform() {
413 if (!tts_platform_)
414 tts_platform_ = TtsPlatform::GetInstance();
415 DCHECK(tts_platform_);
416 return tts_platform_;
417 }
418
TtsPlatformReady()419 bool TtsControllerImpl::TtsPlatformReady() {
420 TtsPlatform* tts_platform = GetTtsPlatform();
421 return tts_platform->PlatformImplSupported() &&
422 tts_platform->PlatformImplInitialized();
423 }
424
TtsPlatformLoading()425 bool TtsControllerImpl::TtsPlatformLoading() {
426 // If the platform implementation is supported, it is considered to be in
427 // loading state until the platform is inititialized. Typically, that means
428 // the libraries are loaded and the voices are being loaded.
429 TtsPlatform* tts_platform = GetTtsPlatform();
430 return tts_platform->PlatformImplSupported() &&
431 !tts_platform->PlatformImplInitialized();
432 }
433
SpeakNow(std::unique_ptr<TtsUtterance> utterance)434 void TtsControllerImpl::SpeakNow(std::unique_ptr<TtsUtterance> utterance) {
435 // Get all available voices and try to find a matching voice.
436 std::vector<VoiceData> voices;
437 GetVoices(utterance->GetBrowserContext(), &voices);
438
439 // Get the best matching voice. If nothing matches, just set "native"
440 // to true because that might trigger deferred loading of native voices.
441 // TODO(katie): Move most of the GetMatchingVoice logic into content/ and
442 // use the TTS controller delegate to get chrome-specific info as needed.
443 int index = GetMatchingVoice(utterance.get(), voices);
444 VoiceData voice;
445 if (index >= 0)
446 voice = voices[index];
447 else
448 voice.native = true;
449
450 UpdateUtteranceDefaults(utterance.get());
451
452 GetTtsPlatform()->WillSpeakUtteranceWithVoice(utterance.get(), voice);
453
454 base::RecordAction(base::UserMetricsAction("TextToSpeech.Speak"));
455 UMA_HISTOGRAM_COUNTS_100000("TextToSpeech.Utterance.TextLength",
456 utterance->GetText().size());
457 UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.FromExtensionAPI",
458 !utterance->GetSrcUrl().is_empty());
459 UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasVoiceName",
460 !utterance->GetVoiceName().empty());
461 UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasLang",
462 !utterance->GetLang().empty());
463 UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasRate",
464 utterance->GetContinuousParameters().rate != 1.0);
465 UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasPitch",
466 utterance->GetContinuousParameters().pitch != 1.0);
467 UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasVolume",
468 utterance->GetContinuousParameters().volume != 1.0);
469 UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.Native", voice.native);
470
471 if (!voice.native) {
472 #if !defined(OS_ANDROID)
473 DCHECK(!voice.engine_id.empty());
474 SetCurrentUtterance(std::move(utterance));
475 current_utterance_->SetEngineId(voice.engine_id);
476 if (engine_delegate_)
477 engine_delegate_->Speak(current_utterance_.get(), voice);
478 bool sends_end_event =
479 voice.events.find(TTS_EVENT_END) != voice.events.end();
480 if (!sends_end_event) {
481 current_utterance_->Finish();
482 SetCurrentUtterance(nullptr);
483 SpeakNextUtterance();
484 }
485 #endif // !defined(OS_ANDROID)
486 } else {
487 // It's possible for certain platforms to send start events immediately
488 // during |speak|.
489 SetCurrentUtterance(std::move(utterance));
490 if (TtsPlatformReady()) {
491 GetTtsPlatform()->ClearError();
492 GetTtsPlatform()->Speak(
493 current_utterance_->GetId(), current_utterance_->GetText(),
494 current_utterance_->GetLang(), voice,
495 current_utterance_->GetContinuousParameters(),
496 base::BindOnce(&TtsControllerImpl::OnSpeakFinished,
497 base::Unretained(this), current_utterance_->GetId()));
498 } else {
499 // The TTS platform is not supported.
500 OnSpeakFinished(current_utterance_->GetId(), false);
501 }
502 }
503 }
504
OnSpeakFinished(int utterance_id,bool success)505 void TtsControllerImpl::OnSpeakFinished(int utterance_id, bool success) {
506 if (success)
507 return;
508
509 // Since OnSpeakFinished could run asynchronously, it is possible that the
510 // current utterance has changed. Ignore any such spurious callbacks.
511 if (!current_utterance_ || current_utterance_->GetId() != utterance_id)
512 return;
513
514 // If the native voice wasn't able to process this speech, see if
515 // the browser has built-in TTS that isn't loaded yet.
516 if (GetTtsPlatform()->LoadBuiltInTtsEngine(
517 current_utterance_->GetBrowserContext())) {
518 utterance_list_.emplace_back(std::move(current_utterance_));
519 return;
520 }
521
522 current_utterance_->OnTtsEvent(TTS_EVENT_ERROR, kInvalidCharIndex,
523 kInvalidLength, GetTtsPlatform()->GetError());
524 SetCurrentUtterance(nullptr);
525 }
526
ClearUtteranceQueue(bool send_events)527 void TtsControllerImpl::ClearUtteranceQueue(bool send_events) {
528 while (!utterance_list_.empty()) {
529 std::unique_ptr<TtsUtterance> utterance =
530 std::move(utterance_list_.front());
531 utterance_list_.pop_front();
532 if (send_events) {
533 utterance->OnTtsEvent(TTS_EVENT_CANCELLED, kInvalidCharIndex,
534 kInvalidLength, std::string());
535 } else {
536 utterance->Finish();
537 }
538 }
539 }
540
FinishCurrentUtterance()541 void TtsControllerImpl::FinishCurrentUtterance() {
542 if (current_utterance_) {
543 if (!current_utterance_->IsFinished())
544 current_utterance_->OnTtsEvent(TTS_EVENT_INTERRUPTED, kInvalidCharIndex,
545 kInvalidLength, std::string());
546 SetCurrentUtterance(nullptr);
547 }
548 }
549
SpeakNextUtterance()550 void TtsControllerImpl::SpeakNextUtterance() {
551 if (paused_)
552 return;
553
554 // Start speaking the next utterance in the queue. Keep trying in case
555 // one fails but there are still more in the queue to try.
556 while (!utterance_list_.empty() && !current_utterance_) {
557 std::unique_ptr<TtsUtterance> utterance =
558 std::move(utterance_list_.front());
559 utterance_list_.pop_front();
560 if (ShouldSpeakUtterance(utterance.get()))
561 SpeakNow(std::move(utterance));
562 else
563 utterance->Finish();
564 }
565 }
566
UpdateUtteranceDefaults(TtsUtterance * utterance)567 void TtsControllerImpl::UpdateUtteranceDefaults(TtsUtterance* utterance) {
568 double rate = utterance->GetContinuousParameters().rate;
569 double pitch = utterance->GetContinuousParameters().pitch;
570 double volume = utterance->GetContinuousParameters().volume;
571 #if defined(OS_CHROMEOS)
572 if (GetTtsControllerDelegate())
573 GetTtsControllerDelegate()->UpdateUtteranceDefaultsFromPrefs(
574 utterance, &rate, &pitch, &volume);
575 #else
576 // Update pitch, rate and volume to defaults if not explicity set on
577 // this utterance.
578 if (rate == blink::mojom::kSpeechSynthesisDoublePrefNotSet)
579 rate = blink::mojom::kSpeechSynthesisDefaultRate;
580 if (pitch == blink::mojom::kSpeechSynthesisDoublePrefNotSet)
581 pitch = blink::mojom::kSpeechSynthesisDefaultPitch;
582 if (volume == blink::mojom::kSpeechSynthesisDoublePrefNotSet)
583 volume = blink::mojom::kSpeechSynthesisDefaultVolume;
584 #endif // defined(OS_CHROMEOS)
585 utterance->SetContinuousParameters(rate, pitch, volume);
586 }
587
StripSSML(const std::string & utterance,base::OnceCallback<void (const std::string &)> on_ssml_parsed)588 void TtsControllerImpl::StripSSML(
589 const std::string& utterance,
590 base::OnceCallback<void(const std::string&)> on_ssml_parsed) {
591 // Skip parsing and return if not xml.
592 if (utterance.find("<?xml") == std::string::npos) {
593 std::move(on_ssml_parsed).Run(utterance);
594 return;
595 }
596
597 // Parse using safe, out-of-process Xml Parser.
598 data_decoder::DataDecoder::ParseXmlIsolated(
599 utterance, base::BindOnce(&TtsControllerImpl::StripSSMLHelper, utterance,
600 std::move(on_ssml_parsed)));
601 }
602
603 // Called when ParseXml finishes.
604 // Uses parsed xml to build parsed utterance text.
StripSSMLHelper(const std::string & utterance,base::OnceCallback<void (const std::string &)> on_ssml_parsed,data_decoder::DataDecoder::ValueOrError result)605 void TtsControllerImpl::StripSSMLHelper(
606 const std::string& utterance,
607 base::OnceCallback<void(const std::string&)> on_ssml_parsed,
608 data_decoder::DataDecoder::ValueOrError result) {
609 // Error checks.
610 // If invalid xml, return original utterance text.
611 if (!result.value) {
612 std::move(on_ssml_parsed).Run(utterance);
613 return;
614 }
615
616 std::string root_tag_name;
617 data_decoder::GetXmlElementTagName(*result.value, &root_tag_name);
618 // Root element must be <speak>.
619 if (root_tag_name.compare("speak") != 0) {
620 std::move(on_ssml_parsed).Run(utterance);
621 return;
622 }
623
624 std::string parsed_text;
625 // Change from unique_ptr to base::Value* so recursion will work.
626 PopulateParsedText(&parsed_text, &(*result.value));
627
628 // Run with parsed_text.
629 std::move(on_ssml_parsed).Run(parsed_text);
630 }
631
PopulateParsedText(std::string * parsed_text,const base::Value * element)632 void TtsControllerImpl::PopulateParsedText(std::string* parsed_text,
633 const base::Value* element) {
634 DCHECK(parsed_text);
635 if (!element)
636 return;
637 // Add element's text if present.
638 // Note: We don't use data_decoder::GetXmlElementText because it gets the text
639 // of element's first child, not text of current element.
640 const base::Value* text_value = element->FindKeyOfType(
641 data_decoder::mojom::XmlParser::kTextKey, base::Value::Type::STRING);
642 if (text_value)
643 *parsed_text += text_value->GetString();
644
645 const base::Value* children = data_decoder::GetXmlElementChildren(*element);
646 if (!children || !children->is_list())
647 return;
648
649 for (size_t i = 0; i < children->GetList().size(); ++i) {
650 // We need to iterate over all children because some text elements are
651 // nested within other types of elements, such as <emphasis> tags.
652 PopulateParsedText(parsed_text, &children->GetList()[i]);
653 }
654 }
655
GetMatchingVoice(TtsUtterance * utterance,const std::vector<VoiceData> & voices)656 int TtsControllerImpl::GetMatchingVoice(TtsUtterance* utterance,
657 const std::vector<VoiceData>& voices) {
658 const std::string app_lang =
659 GetContentClient()->browser()->GetApplicationLocale();
660 // Start with a best score of -1, that way even if none of the criteria
661 // match, something will be returned if there are any voices.
662 int best_score = -1;
663 int best_score_index = -1;
664 #if defined(OS_CHROMEOS)
665 TtsControllerDelegate* delegate = GetTtsControllerDelegate();
666 std::unique_ptr<TtsControllerDelegate::PreferredVoiceIds> preferred_ids =
667 delegate ? delegate->GetPreferredVoiceIdsForUtterance(utterance)
668 : nullptr;
669 #endif // defined(OS_CHROMEOS)
670 for (size_t i = 0; i < voices.size(); ++i) {
671 const content::VoiceData& voice = voices[i];
672 int score = 0;
673
674 // If the extension ID is specified, check for an exact match.
675 if (!utterance->GetEngineId().empty() &&
676 utterance->GetEngineId() != voice.engine_id)
677 continue;
678
679 // If the voice name is specified, check for an exact match.
680 if (!utterance->GetVoiceName().empty() &&
681 voice.name != utterance->GetVoiceName())
682 continue;
683
684 // Prefer the utterance language.
685 if (!voice.lang.empty() && !utterance->GetLang().empty()) {
686 // An exact language match is worth more than a partial match.
687 if (voice.lang == utterance->GetLang()) {
688 score += 128;
689 } else if (l10n_util::GetLanguage(voice.lang) ==
690 l10n_util::GetLanguage(utterance->GetLang())) {
691 score += 64;
692 }
693 }
694
695 // Next, prefer required event types.
696 if (!utterance->GetRequiredEventTypes().empty()) {
697 bool has_all_required_event_types = true;
698 for (TtsEventType event_type : utterance->GetRequiredEventTypes()) {
699 if (voice.events.find(event_type) == voice.events.end()) {
700 has_all_required_event_types = false;
701 break;
702 }
703 }
704 if (has_all_required_event_types)
705 score += 32;
706 }
707
708 #if defined(OS_CHROMEOS)
709 if (preferred_ids) {
710 // First prefer the user's preference voice for the utterance language,
711 // if the utterance language is specified.
712 if (!utterance->GetLang().empty() &&
713 VoiceIdMatches(preferred_ids->lang_voice_id, voice)) {
714 score += 16;
715 }
716
717 // Then prefer the user's preference voice for the system language.
718 // This is a lower priority match than the utterance voice.
719 if (VoiceIdMatches(preferred_ids->locale_voice_id, voice))
720 score += 8;
721
722 // Finally, prefer the user's preference voice for any language. This will
723 // pick the default voice if there is no better match for the current
724 // system language and utterance language.
725 if (VoiceIdMatches(preferred_ids->any_locale_voice_id, voice))
726 score += 4;
727 }
728 #endif // defined(OS_CHROMEOS)
729
730 // Finally, prefer system language.
731 if (!voice.lang.empty()) {
732 if (voice.lang == app_lang) {
733 score += 2;
734 } else if (l10n_util::GetLanguage(voice.lang) ==
735 l10n_util::GetLanguage(app_lang)) {
736 score += 1;
737 }
738 }
739
740 if (score > best_score) {
741 best_score = score;
742 best_score_index = i;
743 }
744 }
745
746 return best_score_index;
747 }
748
SetCurrentUtterance(std::unique_ptr<TtsUtterance> utterance)749 void TtsControllerImpl::SetCurrentUtterance(
750 std::unique_ptr<TtsUtterance> utterance) {
751 current_utterance_ = std::move(utterance);
752 Observe(current_utterance_
753 ? AsUtteranceImpl(current_utterance_.get())->web_contents()
754 : nullptr);
755 }
756
StopCurrentUtteranceAndRemoveUtterancesMatching(WebContents * wc)757 void TtsControllerImpl::StopCurrentUtteranceAndRemoveUtterancesMatching(
758 WebContents* wc) {
759 DCHECK(wc);
760 // Removes any utterances that match the WebContents from the current
761 // utterance (which our inherited WebContentsObserver starts observing every
762 // time the utterance changes).
763 //
764 // This is called when the WebContents for the current utterance is destroyed
765 // or hidden. In the case where it's destroyed, this is done to avoid
766 // attempting to start a utterance that is very likely to be destroyed right
767 // away, and there are also subtle timing issues if we didn't do this (if a
768 // queued utterance has already received WebContentsDestroyed(), and we start
769 // it, we won't get the corresponding WebContentsDestroyed()).
770 auto eraser = [wc](const std::unique_ptr<TtsUtterance>& utterance) {
771 TtsUtteranceImpl* utterance_impl = AsUtteranceImpl(utterance.get());
772 if (utterance_impl->web_contents() == wc) {
773 utterance_impl->Finish();
774 return true;
775 }
776 return false;
777 };
778 utterance_list_.erase(
779 std::remove_if(utterance_list_.begin(), utterance_list_.end(), eraser),
780 utterance_list_.end());
781 const bool stopped = StopCurrentUtteranceIfMatches(GURL());
782 DCHECK(stopped);
783 SpeakNextUtterance();
784 }
785
ShouldSpeakUtterance(TtsUtterance * utterance)786 bool TtsControllerImpl::ShouldSpeakUtterance(TtsUtterance* utterance) {
787 TtsUtteranceImpl* utterance_impl = AsUtteranceImpl(utterance);
788 if (!utterance_impl->was_created_with_web_contents())
789 return true;
790
791 // If the WebContents that created the utterance has been destroyed, don't
792 // speak it.
793 if (!utterance_impl->web_contents())
794 return false;
795
796 // Allow speaking if either the WebContents is visible, or the WebContents
797 // isn't required to be visible before speaking.
798 return !stop_speaking_when_hidden_ ||
799 utterance_impl->web_contents()->GetVisibility() != Visibility::HIDDEN;
800 }
801
802 //
803 // WebContentsObserver
804 //
805
WebContentsDestroyed()806 void TtsControllerImpl::WebContentsDestroyed() {
807 StopCurrentUtteranceAndRemoveUtterancesMatching(web_contents());
808 }
809
OnVisibilityChanged(Visibility visibility)810 void TtsControllerImpl::OnVisibilityChanged(Visibility visibility) {
811 if (visibility == Visibility::HIDDEN && stop_speaking_when_hidden_)
812 StopCurrentUtteranceAndRemoveUtterancesMatching(web_contents());
813 }
814
815 #if defined(OS_CHROMEOS)
GetTtsControllerDelegate()816 TtsControllerDelegate* TtsControllerImpl::GetTtsControllerDelegate() {
817 if (delegate_)
818 return delegate_;
819 if (GetContentClient() && GetContentClient()->browser()) {
820 delegate_ = GetContentClient()->browser()->GetTtsControllerDelegate();
821 return delegate_;
822 }
823 return nullptr;
824 }
825
SetTtsControllerDelegateForTesting(TtsControllerDelegate * delegate)826 void TtsControllerImpl::SetTtsControllerDelegateForTesting(
827 TtsControllerDelegate* delegate) {
828 delegate_ = delegate;
829 }
830
831 #endif // defined(OS_CHROMEOS)
832
833 } // namespace content
834