1 // Copyright 2018 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "content/browser/speech/tts_controller_impl.h"
6 
7 #include <stddef.h>
8 
9 #include <algorithm>
10 #include <string>
11 #include <vector>
12 
13 #include "base/bind.h"
14 #include "base/containers/queue.h"
15 #include "base/json/json_reader.h"
16 #include "base/metrics/histogram_macros.h"
17 #include "base/metrics/user_metrics.h"
18 #include "base/values.h"
19 #include "build/build_config.h"
20 #include "content/browser/speech/tts_utterance_impl.h"
21 #include "content/public/browser/content_browser_client.h"
22 #include "content/public/browser/visibility.h"
23 #include "content/public/browser/web_contents.h"
24 #include "content/public/common/content_client.h"
25 #include "services/data_decoder/public/cpp/safe_xml_parser.h"
26 #include "services/data_decoder/public/mojom/xml_parser.mojom.h"
27 #include "third_party/blink/public/mojom/speech/speech_synthesis.mojom.h"
28 #include "ui/base/l10n/l10n_util.h"
29 
30 #if defined(OS_CHROMEOS)
31 #include "content/public/browser/tts_controller_delegate.h"
32 #endif
33 
34 namespace content {
35 namespace {
// Sentinel passed as the char index of a TTS event that has none.
constexpr int kInvalidCharIndex = -1;

// Sentinel passed as the length of a TTS event that has none.
constexpr int kInvalidLength = -1;
41 
42 #if defined(OS_CHROMEOS)
VoiceIdMatches(const base::Optional<TtsControllerDelegate::PreferredVoiceId> & id,const content::VoiceData & voice)43 bool VoiceIdMatches(
44     const base::Optional<TtsControllerDelegate::PreferredVoiceId>& id,
45     const content::VoiceData& voice) {
46   if (!id.has_value() || voice.name.empty() ||
47       (voice.engine_id.empty() && !voice.native))
48     return false;
49   if (voice.native)
50     return id->name == voice.name && id->id.empty();
51   return id->name == voice.name && id->id == voice.engine_id;
52 }
53 #endif  // defined(OS_CHROMEOS)
54 
AsUtteranceImpl(TtsUtterance * utterance)55 TtsUtteranceImpl* AsUtteranceImpl(TtsUtterance* utterance) {
56   return static_cast<TtsUtteranceImpl*>(utterance);
57 }
58 
59 }  // namespace
60 
61 //
62 // VoiceData
63 //
64 
VoiceData()65 VoiceData::VoiceData() : remote(false), native(false) {}
66 
67 VoiceData::VoiceData(const VoiceData& other) = default;
68 
~VoiceData()69 VoiceData::~VoiceData() {}
70 
71 //
72 // TtsController
73 //
74 
GetInstance()75 TtsController* TtsController::GetInstance() {
76   return TtsControllerImpl::GetInstance();
77 }
78 
// Buckets of the "TextToSpeech.Event" histogram.
//
// IMPORTANT: these values are written to logs. Never renumber or delete an
// existing entry; new entries may only be appended right before COUNT.
enum class UMATextToSpeechEvent {
  START = 0,
  END = 1,
  WORD = 2,
  SENTENCE = 3,
  MARKER = 4,
  INTERRUPTED = 5,
  CANCELLED = 6,
  SPEECH_ERROR = 7,
  PAUSE = 8,
  RESUME = 9,

  // Must stay the last enumerator. Its value may grow as entries are added,
  // but the values above it must never change.
  COUNT,
};
98 
99 //
100 // TtsControllerImpl
101 //
102 
103 // static
GetInstance()104 TtsControllerImpl* TtsControllerImpl::GetInstance() {
105   return base::Singleton<TtsControllerImpl>::get();
106 }
107 
SetStopSpeakingWhenHidden(bool value)108 void TtsControllerImpl::SetStopSpeakingWhenHidden(bool value) {
109   stop_speaking_when_hidden_ = value;
110 }
111 
112 TtsControllerImpl::TtsControllerImpl() = default;
113 
~TtsControllerImpl()114 TtsControllerImpl::~TtsControllerImpl() {
115   if (current_utterance_) {
116     current_utterance_->Finish();
117     SetCurrentUtterance(nullptr);
118   }
119 
120   // Clear any queued utterances too.
121   ClearUtteranceQueue(false);  // Don't sent events.
122 }
123 
SpeakOrEnqueue(std::unique_ptr<TtsUtterance> utterance)124 void TtsControllerImpl::SpeakOrEnqueue(
125     std::unique_ptr<TtsUtterance> utterance) {
126   if (!ShouldSpeakUtterance(utterance.get())) {
127     utterance->Finish();
128     return;
129   }
130 
131   // If the TTS platform is still loading, queue or flush the utterance. The
132   // utterances can be sent to platform specific implementation or to the
133   // engine implementation. Every utterances are postponed until the platform
134   // specific implementation is loaded to avoid racy behaviors.
135   if (TtsPlatformLoading()) {
136     bool can_enqueue = utterance->GetCanEnqueue();
137     utterance_list_.emplace_back(std::move(utterance));
138     if (!can_enqueue)
139       ClearUtteranceQueue(true);
140     return;
141   }
142 
143   // If we're paused and we get an utterance that can't be queued,
144   // flush the queue but stay in the paused state.
145   if (paused_ && !utterance->GetCanEnqueue()) {
146     utterance_list_.emplace_back(std::move(utterance));
147     Stop();
148     paused_ = true;
149     return;
150   }
151 
152   if (paused_ || (IsSpeaking() && utterance->GetCanEnqueue())) {
153     utterance_list_.emplace_back(std::move(utterance));
154   } else {
155     Stop();
156     SpeakNow(std::move(utterance));
157   }
158 }
159 
Stop()160 void TtsControllerImpl::Stop() {
161   StopAndClearQueue(GURL());
162 }
163 
Stop(const GURL & source_url)164 void TtsControllerImpl::Stop(const GURL& source_url) {
165   StopAndClearQueue(source_url);
166 }
167 
StopAndClearQueue(const GURL & source_url)168 void TtsControllerImpl::StopAndClearQueue(const GURL& source_url) {
169   if (StopCurrentUtteranceIfMatches(source_url))
170     ClearUtteranceQueue(true);
171 }
172 
StopCurrentUtteranceIfMatches(const GURL & source_url)173 bool TtsControllerImpl::StopCurrentUtteranceIfMatches(const GURL& source_url) {
174   base::RecordAction(base::UserMetricsAction("TextToSpeech.Stop"));
175 
176   paused_ = false;
177 
178   if (!source_url.is_empty() && current_utterance_ &&
179       current_utterance_->GetSrcUrl().GetOrigin() != source_url.GetOrigin())
180     return false;
181 
182   if (current_utterance_ && !current_utterance_->GetEngineId().empty()) {
183     if (engine_delegate_)
184       engine_delegate_->Stop(current_utterance_.get());
185   } else if (TtsPlatformReady()) {
186     GetTtsPlatform()->ClearError();
187     GetTtsPlatform()->StopSpeaking();
188   }
189 
190   if (current_utterance_)
191     current_utterance_->OnTtsEvent(TTS_EVENT_INTERRUPTED, kInvalidCharIndex,
192                                    kInvalidLength, std::string());
193   FinishCurrentUtterance();
194   return true;
195 }
196 
Pause()197 void TtsControllerImpl::Pause() {
198   base::RecordAction(base::UserMetricsAction("TextToSpeech.Pause"));
199   if (paused_)
200     return;
201 
202   paused_ = true;
203   if (current_utterance_ && !current_utterance_->GetEngineId().empty()) {
204     if (engine_delegate_)
205       engine_delegate_->Pause(current_utterance_.get());
206   } else if (current_utterance_) {
207     DCHECK(TtsPlatformReady());
208     GetTtsPlatform()->ClearError();
209     GetTtsPlatform()->Pause();
210   }
211 }
212 
Resume()213 void TtsControllerImpl::Resume() {
214   base::RecordAction(base::UserMetricsAction("TextToSpeech.Resume"));
215   if (!paused_)
216     return;
217 
218   paused_ = false;
219   if (current_utterance_ && !current_utterance_->GetEngineId().empty()) {
220     if (engine_delegate_)
221       engine_delegate_->Resume(current_utterance_.get());
222   } else if (current_utterance_) {
223     DCHECK(TtsPlatformReady());
224     GetTtsPlatform()->ClearError();
225     GetTtsPlatform()->Resume();
226   } else {
227     SpeakNextUtterance();
228   }
229 }
230 
OnTtsEvent(int utterance_id,TtsEventType event_type,int char_index,int length,const std::string & error_message)231 void TtsControllerImpl::OnTtsEvent(int utterance_id,
232                                    TtsEventType event_type,
233                                    int char_index,
234                                    int length,
235                                    const std::string& error_message) {
236   // We may sometimes receive completion callbacks "late", after we've
237   // already finished the utterance (for example because another utterance
238   // interrupted or we got a call to Stop). This is normal and we can
239   // safely just ignore these events.
240   if (!current_utterance_ || utterance_id != current_utterance_->GetId()) {
241     return;
242   }
243 
244   UMATextToSpeechEvent metric;
245   switch (event_type) {
246     case TTS_EVENT_START:
247       metric = UMATextToSpeechEvent::START;
248       break;
249     case TTS_EVENT_END:
250       metric = UMATextToSpeechEvent::END;
251       break;
252     case TTS_EVENT_WORD:
253       metric = UMATextToSpeechEvent::WORD;
254       break;
255     case TTS_EVENT_SENTENCE:
256       metric = UMATextToSpeechEvent::SENTENCE;
257       break;
258     case TTS_EVENT_MARKER:
259       metric = UMATextToSpeechEvent::MARKER;
260       break;
261     case TTS_EVENT_INTERRUPTED:
262       metric = UMATextToSpeechEvent::INTERRUPTED;
263       break;
264     case TTS_EVENT_CANCELLED:
265       metric = UMATextToSpeechEvent::CANCELLED;
266       break;
267     case TTS_EVENT_ERROR:
268       metric = UMATextToSpeechEvent::SPEECH_ERROR;
269       break;
270     case TTS_EVENT_PAUSE:
271       metric = UMATextToSpeechEvent::PAUSE;
272       break;
273     case TTS_EVENT_RESUME:
274       metric = UMATextToSpeechEvent::RESUME;
275       break;
276     default:
277       NOTREACHED();
278       return;
279   }
280   UMA_HISTOGRAM_ENUMERATION("TextToSpeech.Event", metric,
281                             UMATextToSpeechEvent::COUNT);
282 
283   current_utterance_->OnTtsEvent(event_type, char_index, length, error_message);
284   if (current_utterance_->IsFinished()) {
285     FinishCurrentUtterance();
286     SpeakNextUtterance();
287   }
288 }
289 
GetVoices(BrowserContext * browser_context,std::vector<VoiceData> * out_voices)290 void TtsControllerImpl::GetVoices(BrowserContext* browser_context,
291                                   std::vector<VoiceData>* out_voices) {
292   TtsPlatform* tts_platform = GetTtsPlatform();
293   DCHECK(tts_platform);
294   // Ensure we have all built-in voices loaded. This is a no-op if already
295   // loaded.
296   tts_platform->LoadBuiltInTtsEngine(browser_context);
297   if (TtsPlatformReady())
298     tts_platform->GetVoices(out_voices);
299 
300   if (browser_context && engine_delegate_)
301     engine_delegate_->GetVoices(browser_context, out_voices);
302 }
303 
IsSpeaking()304 bool TtsControllerImpl::IsSpeaking() {
305   return current_utterance_ != nullptr ||
306          (TtsPlatformReady() && GetTtsPlatform()->IsSpeaking());
307 }
308 
VoicesChanged()309 void TtsControllerImpl::VoicesChanged() {
310   if (!voices_changed_delegates_.might_have_observers() || TtsPlatformLoading())
311     return;
312 
313   // Existence of platform tts indicates explicit requests to tts. Since
314   // |VoicesChanged| can occur implicitly, only send if needed.
315   for (auto& delegate : voices_changed_delegates_)
316     delegate.OnVoicesChanged();
317 
318   if (!current_utterance_ && !utterance_list_.empty())
319     SpeakNextUtterance();
320 }
321 
AddVoicesChangedDelegate(VoicesChangedDelegate * delegate)322 void TtsControllerImpl::AddVoicesChangedDelegate(
323     VoicesChangedDelegate* delegate) {
324   voices_changed_delegates_.AddObserver(delegate);
325 }
326 
RemoveVoicesChangedDelegate(VoicesChangedDelegate * delegate)327 void TtsControllerImpl::RemoveVoicesChangedDelegate(
328     VoicesChangedDelegate* delegate) {
329   voices_changed_delegates_.RemoveObserver(delegate);
330 }
331 
RemoveUtteranceEventDelegate(UtteranceEventDelegate * delegate)332 void TtsControllerImpl::RemoveUtteranceEventDelegate(
333     UtteranceEventDelegate* delegate) {
334   // First clear any pending utterances with this delegate.
335   std::list<std::unique_ptr<TtsUtterance>> old_list;
336   utterance_list_.swap(old_list);
337   while (!old_list.empty()) {
338     std::unique_ptr<TtsUtterance> utterance = std::move(old_list.front());
339     old_list.pop_front();
340     if (utterance->GetEventDelegate() != delegate)
341       utterance_list_.emplace_back(std::move(utterance));
342   }
343 
344   if (current_utterance_ &&
345       current_utterance_->GetEventDelegate() == delegate) {
346     current_utterance_->SetEventDelegate(nullptr);
347     if (!current_utterance_->GetEngineId().empty()) {
348       if (engine_delegate_)
349         engine_delegate_->Stop(current_utterance_.get());
350     } else {
351       DCHECK(TtsPlatformReady());
352       GetTtsPlatform()->ClearError();
353       GetTtsPlatform()->StopSpeaking();
354     }
355 
356     FinishCurrentUtterance();
357     SpeakNextUtterance();
358   }
359 }
360 
SetTtsEngineDelegate(TtsEngineDelegate * delegate)361 void TtsControllerImpl::SetTtsEngineDelegate(TtsEngineDelegate* delegate) {
362   engine_delegate_ = delegate;
363 }
364 
GetTtsEngineDelegate()365 TtsEngineDelegate* TtsControllerImpl::GetTtsEngineDelegate() {
366   return engine_delegate_;
367 }
368 
Shutdown()369 void TtsControllerImpl::Shutdown() {
370   if (tts_platform_)
371     tts_platform_->Shutdown();
372 }
373 
OnBrowserContextDestroyed(BrowserContext * browser_context)374 void TtsControllerImpl::OnBrowserContextDestroyed(
375     BrowserContext* browser_context) {
376   bool did_clear_utterances = false;
377 
378   // First clear the BrowserContext from any utterances.
379   for (std::unique_ptr<TtsUtterance>& utterance : utterance_list_) {
380     if (utterance->GetBrowserContext() == browser_context) {
381       utterance->ClearBrowserContext();
382       did_clear_utterances = true;
383     }
384   }
385 
386   if (current_utterance_ &&
387       current_utterance_->GetBrowserContext() == browser_context) {
388     current_utterance_->ClearBrowserContext();
389     did_clear_utterances = true;
390   }
391 
392   // If we cleared the BrowserContext from any utterances, stop speech
393   // just to be safe. Do this using PostTask because calling Stop might
394   // try to send notifications and that can trigger code paths that try
395   // to access the BrowserContext that's being deleted. Note that it's
396   // safe to use base::Unretained because this is a singleton.
397   if (did_clear_utterances) {
398     base::ThreadTaskRunnerHandle::Get()->PostTask(
399         FROM_HERE, base::BindOnce(&TtsControllerImpl::StopAndClearQueue,
400                                   base::Unretained(this), GURL()));
401   }
402 }
403 
SetTtsPlatform(TtsPlatform * tts_platform)404 void TtsControllerImpl::SetTtsPlatform(TtsPlatform* tts_platform) {
405   tts_platform_ = tts_platform;
406 }
407 
QueueSize()408 int TtsControllerImpl::QueueSize() {
409   return static_cast<int>(utterance_list_.size());
410 }
411 
GetTtsPlatform()412 TtsPlatform* TtsControllerImpl::GetTtsPlatform() {
413   if (!tts_platform_)
414     tts_platform_ = TtsPlatform::GetInstance();
415   DCHECK(tts_platform_);
416   return tts_platform_;
417 }
418 
TtsPlatformReady()419 bool TtsControllerImpl::TtsPlatformReady() {
420   TtsPlatform* tts_platform = GetTtsPlatform();
421   return tts_platform->PlatformImplSupported() &&
422          tts_platform->PlatformImplInitialized();
423 }
424 
TtsPlatformLoading()425 bool TtsControllerImpl::TtsPlatformLoading() {
426   // If the platform implementation is supported, it is considered to be in
427   // loading state until the platform is inititialized. Typically, that means
428   // the libraries are loaded and the voices are being loaded.
429   TtsPlatform* tts_platform = GetTtsPlatform();
430   return tts_platform->PlatformImplSupported() &&
431          !tts_platform->PlatformImplInitialized();
432 }
433 
SpeakNow(std::unique_ptr<TtsUtterance> utterance)434 void TtsControllerImpl::SpeakNow(std::unique_ptr<TtsUtterance> utterance) {
435   // Get all available voices and try to find a matching voice.
436   std::vector<VoiceData> voices;
437   GetVoices(utterance->GetBrowserContext(), &voices);
438 
439   // Get the best matching voice. If nothing matches, just set "native"
440   // to true because that might trigger deferred loading of native voices.
441   // TODO(katie): Move most of the GetMatchingVoice logic into content/ and
442   // use the TTS controller delegate to get chrome-specific info as needed.
443   int index = GetMatchingVoice(utterance.get(), voices);
444   VoiceData voice;
445   if (index >= 0)
446     voice = voices[index];
447   else
448     voice.native = true;
449 
450   UpdateUtteranceDefaults(utterance.get());
451 
452   GetTtsPlatform()->WillSpeakUtteranceWithVoice(utterance.get(), voice);
453 
454   base::RecordAction(base::UserMetricsAction("TextToSpeech.Speak"));
455   UMA_HISTOGRAM_COUNTS_100000("TextToSpeech.Utterance.TextLength",
456                               utterance->GetText().size());
457   UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.FromExtensionAPI",
458                         !utterance->GetSrcUrl().is_empty());
459   UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasVoiceName",
460                         !utterance->GetVoiceName().empty());
461   UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasLang",
462                         !utterance->GetLang().empty());
463   UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasRate",
464                         utterance->GetContinuousParameters().rate != 1.0);
465   UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasPitch",
466                         utterance->GetContinuousParameters().pitch != 1.0);
467   UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasVolume",
468                         utterance->GetContinuousParameters().volume != 1.0);
469   UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.Native", voice.native);
470 
471   if (!voice.native) {
472 #if !defined(OS_ANDROID)
473     DCHECK(!voice.engine_id.empty());
474     SetCurrentUtterance(std::move(utterance));
475     current_utterance_->SetEngineId(voice.engine_id);
476     if (engine_delegate_)
477       engine_delegate_->Speak(current_utterance_.get(), voice);
478     bool sends_end_event =
479         voice.events.find(TTS_EVENT_END) != voice.events.end();
480     if (!sends_end_event) {
481       current_utterance_->Finish();
482       SetCurrentUtterance(nullptr);
483       SpeakNextUtterance();
484     }
485 #endif  // !defined(OS_ANDROID)
486   } else {
487     // It's possible for certain platforms to send start events immediately
488     // during |speak|.
489     SetCurrentUtterance(std::move(utterance));
490     if (TtsPlatformReady()) {
491       GetTtsPlatform()->ClearError();
492       GetTtsPlatform()->Speak(
493           current_utterance_->GetId(), current_utterance_->GetText(),
494           current_utterance_->GetLang(), voice,
495           current_utterance_->GetContinuousParameters(),
496           base::BindOnce(&TtsControllerImpl::OnSpeakFinished,
497                          base::Unretained(this), current_utterance_->GetId()));
498     } else {
499       // The TTS platform is not supported.
500       OnSpeakFinished(current_utterance_->GetId(), false);
501     }
502   }
503 }
504 
OnSpeakFinished(int utterance_id,bool success)505 void TtsControllerImpl::OnSpeakFinished(int utterance_id, bool success) {
506   if (success)
507     return;
508 
509   // Since OnSpeakFinished could run asynchronously, it is possible that the
510   // current utterance has changed. Ignore any such spurious callbacks.
511   if (!current_utterance_ || current_utterance_->GetId() != utterance_id)
512     return;
513 
514   // If the native voice wasn't able to process this speech, see if
515   // the browser has built-in TTS that isn't loaded yet.
516   if (GetTtsPlatform()->LoadBuiltInTtsEngine(
517           current_utterance_->GetBrowserContext())) {
518     utterance_list_.emplace_back(std::move(current_utterance_));
519     return;
520   }
521 
522   current_utterance_->OnTtsEvent(TTS_EVENT_ERROR, kInvalidCharIndex,
523                                  kInvalidLength, GetTtsPlatform()->GetError());
524   SetCurrentUtterance(nullptr);
525 }
526 
ClearUtteranceQueue(bool send_events)527 void TtsControllerImpl::ClearUtteranceQueue(bool send_events) {
528   while (!utterance_list_.empty()) {
529     std::unique_ptr<TtsUtterance> utterance =
530         std::move(utterance_list_.front());
531     utterance_list_.pop_front();
532     if (send_events) {
533       utterance->OnTtsEvent(TTS_EVENT_CANCELLED, kInvalidCharIndex,
534                             kInvalidLength, std::string());
535     } else {
536       utterance->Finish();
537     }
538   }
539 }
540 
FinishCurrentUtterance()541 void TtsControllerImpl::FinishCurrentUtterance() {
542   if (current_utterance_) {
543     if (!current_utterance_->IsFinished())
544       current_utterance_->OnTtsEvent(TTS_EVENT_INTERRUPTED, kInvalidCharIndex,
545                                      kInvalidLength, std::string());
546     SetCurrentUtterance(nullptr);
547   }
548 }
549 
SpeakNextUtterance()550 void TtsControllerImpl::SpeakNextUtterance() {
551   if (paused_)
552     return;
553 
554   // Start speaking the next utterance in the queue.  Keep trying in case
555   // one fails but there are still more in the queue to try.
556   while (!utterance_list_.empty() && !current_utterance_) {
557     std::unique_ptr<TtsUtterance> utterance =
558         std::move(utterance_list_.front());
559     utterance_list_.pop_front();
560     if (ShouldSpeakUtterance(utterance.get()))
561       SpeakNow(std::move(utterance));
562     else
563       utterance->Finish();
564   }
565 }
566 
UpdateUtteranceDefaults(TtsUtterance * utterance)567 void TtsControllerImpl::UpdateUtteranceDefaults(TtsUtterance* utterance) {
568   double rate = utterance->GetContinuousParameters().rate;
569   double pitch = utterance->GetContinuousParameters().pitch;
570   double volume = utterance->GetContinuousParameters().volume;
571 #if defined(OS_CHROMEOS)
572   if (GetTtsControllerDelegate())
573     GetTtsControllerDelegate()->UpdateUtteranceDefaultsFromPrefs(
574         utterance, &rate, &pitch, &volume);
575 #else
576   // Update pitch, rate and volume to defaults if not explicity set on
577   // this utterance.
578   if (rate == blink::mojom::kSpeechSynthesisDoublePrefNotSet)
579     rate = blink::mojom::kSpeechSynthesisDefaultRate;
580   if (pitch == blink::mojom::kSpeechSynthesisDoublePrefNotSet)
581     pitch = blink::mojom::kSpeechSynthesisDefaultPitch;
582   if (volume == blink::mojom::kSpeechSynthesisDoublePrefNotSet)
583     volume = blink::mojom::kSpeechSynthesisDefaultVolume;
584 #endif  // defined(OS_CHROMEOS)
585   utterance->SetContinuousParameters(rate, pitch, volume);
586 }
587 
StripSSML(const std::string & utterance,base::OnceCallback<void (const std::string &)> on_ssml_parsed)588 void TtsControllerImpl::StripSSML(
589     const std::string& utterance,
590     base::OnceCallback<void(const std::string&)> on_ssml_parsed) {
591   // Skip parsing and return if not xml.
592   if (utterance.find("<?xml") == std::string::npos) {
593     std::move(on_ssml_parsed).Run(utterance);
594     return;
595   }
596 
597   // Parse using safe, out-of-process Xml Parser.
598   data_decoder::DataDecoder::ParseXmlIsolated(
599       utterance, base::BindOnce(&TtsControllerImpl::StripSSMLHelper, utterance,
600                                 std::move(on_ssml_parsed)));
601 }
602 
603 // Called when ParseXml finishes.
604 // Uses parsed xml to build parsed utterance text.
StripSSMLHelper(const std::string & utterance,base::OnceCallback<void (const std::string &)> on_ssml_parsed,data_decoder::DataDecoder::ValueOrError result)605 void TtsControllerImpl::StripSSMLHelper(
606     const std::string& utterance,
607     base::OnceCallback<void(const std::string&)> on_ssml_parsed,
608     data_decoder::DataDecoder::ValueOrError result) {
609   // Error checks.
610   // If invalid xml, return original utterance text.
611   if (!result.value) {
612     std::move(on_ssml_parsed).Run(utterance);
613     return;
614   }
615 
616   std::string root_tag_name;
617   data_decoder::GetXmlElementTagName(*result.value, &root_tag_name);
618   // Root element must be <speak>.
619   if (root_tag_name.compare("speak") != 0) {
620     std::move(on_ssml_parsed).Run(utterance);
621     return;
622   }
623 
624   std::string parsed_text;
625   // Change from unique_ptr to base::Value* so recursion will work.
626   PopulateParsedText(&parsed_text, &(*result.value));
627 
628   // Run with parsed_text.
629   std::move(on_ssml_parsed).Run(parsed_text);
630 }
631 
PopulateParsedText(std::string * parsed_text,const base::Value * element)632 void TtsControllerImpl::PopulateParsedText(std::string* parsed_text,
633                                            const base::Value* element) {
634   DCHECK(parsed_text);
635   if (!element)
636     return;
637   // Add element's text if present.
638   // Note: We don't use data_decoder::GetXmlElementText because it gets the text
639   // of element's first child, not text of current element.
640   const base::Value* text_value = element->FindKeyOfType(
641       data_decoder::mojom::XmlParser::kTextKey, base::Value::Type::STRING);
642   if (text_value)
643     *parsed_text += text_value->GetString();
644 
645   const base::Value* children = data_decoder::GetXmlElementChildren(*element);
646   if (!children || !children->is_list())
647     return;
648 
649   for (size_t i = 0; i < children->GetList().size(); ++i) {
650     // We need to iterate over all children because some text elements are
651     // nested within other types of elements, such as <emphasis> tags.
652     PopulateParsedText(parsed_text, &children->GetList()[i]);
653   }
654 }
655 
GetMatchingVoice(TtsUtterance * utterance,const std::vector<VoiceData> & voices)656 int TtsControllerImpl::GetMatchingVoice(TtsUtterance* utterance,
657                                         const std::vector<VoiceData>& voices) {
658   const std::string app_lang =
659       GetContentClient()->browser()->GetApplicationLocale();
660   // Start with a best score of -1, that way even if none of the criteria
661   // match, something will be returned if there are any voices.
662   int best_score = -1;
663   int best_score_index = -1;
664 #if defined(OS_CHROMEOS)
665   TtsControllerDelegate* delegate = GetTtsControllerDelegate();
666   std::unique_ptr<TtsControllerDelegate::PreferredVoiceIds> preferred_ids =
667       delegate ? delegate->GetPreferredVoiceIdsForUtterance(utterance)
668                : nullptr;
669 #endif  // defined(OS_CHROMEOS)
670   for (size_t i = 0; i < voices.size(); ++i) {
671     const content::VoiceData& voice = voices[i];
672     int score = 0;
673 
674     // If the extension ID is specified, check for an exact match.
675     if (!utterance->GetEngineId().empty() &&
676         utterance->GetEngineId() != voice.engine_id)
677       continue;
678 
679     // If the voice name is specified, check for an exact match.
680     if (!utterance->GetVoiceName().empty() &&
681         voice.name != utterance->GetVoiceName())
682       continue;
683 
684     // Prefer the utterance language.
685     if (!voice.lang.empty() && !utterance->GetLang().empty()) {
686       // An exact language match is worth more than a partial match.
687       if (voice.lang == utterance->GetLang()) {
688         score += 128;
689       } else if (l10n_util::GetLanguage(voice.lang) ==
690                  l10n_util::GetLanguage(utterance->GetLang())) {
691         score += 64;
692       }
693     }
694 
695     // Next, prefer required event types.
696     if (!utterance->GetRequiredEventTypes().empty()) {
697       bool has_all_required_event_types = true;
698       for (TtsEventType event_type : utterance->GetRequiredEventTypes()) {
699         if (voice.events.find(event_type) == voice.events.end()) {
700           has_all_required_event_types = false;
701           break;
702         }
703       }
704       if (has_all_required_event_types)
705         score += 32;
706     }
707 
708 #if defined(OS_CHROMEOS)
709     if (preferred_ids) {
710       // First prefer the user's preference voice for the utterance language,
711       // if the utterance language is specified.
712       if (!utterance->GetLang().empty() &&
713           VoiceIdMatches(preferred_ids->lang_voice_id, voice)) {
714         score += 16;
715       }
716 
717       // Then prefer the user's preference voice for the system language.
718       // This is a lower priority match than the utterance voice.
719       if (VoiceIdMatches(preferred_ids->locale_voice_id, voice))
720         score += 8;
721 
722       // Finally, prefer the user's preference voice for any language. This will
723       // pick the default voice if there is no better match for the current
724       // system language and utterance language.
725       if (VoiceIdMatches(preferred_ids->any_locale_voice_id, voice))
726         score += 4;
727     }
728 #endif  // defined(OS_CHROMEOS)
729 
730     // Finally, prefer system language.
731     if (!voice.lang.empty()) {
732       if (voice.lang == app_lang) {
733         score += 2;
734       } else if (l10n_util::GetLanguage(voice.lang) ==
735                  l10n_util::GetLanguage(app_lang)) {
736         score += 1;
737       }
738     }
739 
740     if (score > best_score) {
741       best_score = score;
742       best_score_index = i;
743     }
744   }
745 
746   return best_score_index;
747 }
748 
SetCurrentUtterance(std::unique_ptr<TtsUtterance> utterance)749 void TtsControllerImpl::SetCurrentUtterance(
750     std::unique_ptr<TtsUtterance> utterance) {
751   current_utterance_ = std::move(utterance);
752   Observe(current_utterance_
753               ? AsUtteranceImpl(current_utterance_.get())->web_contents()
754               : nullptr);
755 }
756 
StopCurrentUtteranceAndRemoveUtterancesMatching(WebContents * wc)757 void TtsControllerImpl::StopCurrentUtteranceAndRemoveUtterancesMatching(
758     WebContents* wc) {
759   DCHECK(wc);
760   // Removes any utterances that match the WebContents from the current
761   // utterance (which our inherited WebContentsObserver starts observing every
762   // time the utterance changes).
763   //
764   // This is called when the WebContents for the current utterance is destroyed
765   // or hidden. In the case where it's destroyed, this is done to avoid
766   // attempting to start a utterance that is very likely to be destroyed right
767   // away, and there are also subtle timing issues if we didn't do this (if a
768   // queued utterance has already received WebContentsDestroyed(), and we start
769   // it, we won't get the corresponding WebContentsDestroyed()).
770   auto eraser = [wc](const std::unique_ptr<TtsUtterance>& utterance) {
771     TtsUtteranceImpl* utterance_impl = AsUtteranceImpl(utterance.get());
772     if (utterance_impl->web_contents() == wc) {
773       utterance_impl->Finish();
774       return true;
775     }
776     return false;
777   };
778   utterance_list_.erase(
779       std::remove_if(utterance_list_.begin(), utterance_list_.end(), eraser),
780       utterance_list_.end());
781   const bool stopped = StopCurrentUtteranceIfMatches(GURL());
782   DCHECK(stopped);
783   SpeakNextUtterance();
784 }
785 
ShouldSpeakUtterance(TtsUtterance * utterance)786 bool TtsControllerImpl::ShouldSpeakUtterance(TtsUtterance* utterance) {
787   TtsUtteranceImpl* utterance_impl = AsUtteranceImpl(utterance);
788   if (!utterance_impl->was_created_with_web_contents())
789     return true;
790 
791   // If the WebContents that created the utterance has been destroyed, don't
792   // speak it.
793   if (!utterance_impl->web_contents())
794     return false;
795 
796   // Allow speaking if either the WebContents is visible, or the WebContents
797   // isn't required to be visible before speaking.
798   return !stop_speaking_when_hidden_ ||
799          utterance_impl->web_contents()->GetVisibility() != Visibility::HIDDEN;
800 }
801 
802 //
803 // WebContentsObserver
804 //
805 
WebContentsDestroyed()806 void TtsControllerImpl::WebContentsDestroyed() {
807   StopCurrentUtteranceAndRemoveUtterancesMatching(web_contents());
808 }
809 
OnVisibilityChanged(Visibility visibility)810 void TtsControllerImpl::OnVisibilityChanged(Visibility visibility) {
811   if (visibility == Visibility::HIDDEN && stop_speaking_when_hidden_)
812     StopCurrentUtteranceAndRemoveUtterancesMatching(web_contents());
813 }
814 
815 #if defined(OS_CHROMEOS)
GetTtsControllerDelegate()816 TtsControllerDelegate* TtsControllerImpl::GetTtsControllerDelegate() {
817   if (delegate_)
818     return delegate_;
819   if (GetContentClient() && GetContentClient()->browser()) {
820     delegate_ = GetContentClient()->browser()->GetTtsControllerDelegate();
821     return delegate_;
822   }
823   return nullptr;
824 }
825 
SetTtsControllerDelegateForTesting(TtsControllerDelegate * delegate)826 void TtsControllerImpl::SetTtsControllerDelegateForTesting(
827     TtsControllerDelegate* delegate) {
828   delegate_ = delegate;
829 }
830 
831 #endif  // defined(OS_CHROMEOS)
832 
833 }  // namespace content
834