1 /*
2  * Copyright (C) 2013 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #include "third_party/blink/renderer/modules/speech/speech_synthesis.h"
27 
28 #include "build/build_config.h"
29 #include "third_party/blink/public/common/browser_interface_broker_proxy.h"
30 #include "third_party/blink/public/common/privacy_budget/identifiability_metric_builder.h"
31 #include "third_party/blink/public/common/privacy_budget/identifiability_study_settings.h"
32 #include "third_party/blink/public/common/privacy_budget/identifiable_token.h"
33 #include "third_party/blink/public/common/privacy_budget/identifiable_token_builder.h"
34 #include "third_party/blink/public/common/thread_safe_browser_interface_broker_proxy.h"
35 #include "third_party/blink/public/platform/platform.h"
36 #include "third_party/blink/renderer/bindings/modules/v8/v8_speech_synthesis_error_event_init.h"
37 #include "third_party/blink/renderer/bindings/modules/v8/v8_speech_synthesis_event_init.h"
38 #include "third_party/blink/renderer/core/dom/document.h"
39 #include "third_party/blink/renderer/core/frame/deprecation.h"
40 #include "third_party/blink/renderer/core/frame/local_dom_window.h"
41 #include "third_party/blink/renderer/core/frame/web_feature.h"
42 #include "third_party/blink/renderer/core/html/media/autoplay_policy.h"
43 #include "third_party/blink/renderer/core/timing/dom_window_performance.h"
44 #include "third_party/blink/renderer/core/timing/performance.h"
45 #include "third_party/blink/renderer/modules/speech/speech_synthesis_error_event.h"
46 #include "third_party/blink/renderer/modules/speech/speech_synthesis_event.h"
47 #include "third_party/blink/renderer/modules/speech/speech_synthesis_voice.h"
48 #include "third_party/blink/renderer/platform/instrumentation/use_counter.h"
49 #include "third_party/blink/renderer/platform/privacy_budget/identifiability_digest_helpers.h"
50 
51 namespace blink {
52 
// Name string for this supplement.
// NOTE(review): presumably the key Supplement<LocalDOMWindow> uses to look up
// this object on a window - confirm against supplementable.h.
const char SpeechSynthesis::kSupplementName[] = "SpeechSynthesis";
54 
speechSynthesis(LocalDOMWindow & window)55 SpeechSynthesis* SpeechSynthesis::speechSynthesis(LocalDOMWindow& window) {
56   SpeechSynthesis* synthesis =
57       Supplement<LocalDOMWindow>::From<SpeechSynthesis>(window);
58   if (!synthesis) {
59     synthesis = MakeGarbageCollected<SpeechSynthesis>(window);
60     ProvideTo(window, synthesis);
61 #if defined(OS_ANDROID)
62     // On Android devices we lazily initialize |mojom_synthesis_| to avoid
63     // needlessly binding to the TTS service, see https://crbug.com/811929.
64     // TODO(crbug/811929): Consider moving this logic into the Android-
65     // specific backend implementation.
66 #else
67     ignore_result(synthesis->TryEnsureMojomSynthesis());
68 #endif
69   }
70   return synthesis;
71 }
72 
CreateForTesting(LocalDOMWindow & window,mojo::PendingRemote<mojom::blink::SpeechSynthesis> mojom_synthesis)73 void SpeechSynthesis::CreateForTesting(
74     LocalDOMWindow& window,
75     mojo::PendingRemote<mojom::blink::SpeechSynthesis> mojom_synthesis) {
76   DCHECK(!Supplement<LocalDOMWindow>::From<SpeechSynthesis>(window));
77   SpeechSynthesis* synthesis = MakeGarbageCollected<SpeechSynthesis>(window);
78   ProvideTo(window, synthesis);
79   synthesis->SetMojomSynthesisForTesting(std::move(mojom_synthesis));
80 }
81 
SpeechSynthesis(LocalDOMWindow & window)82 SpeechSynthesis::SpeechSynthesis(LocalDOMWindow& window)
83     : Supplement<LocalDOMWindow>(window),
84       receiver_(this, &window),
85       mojom_synthesis_(&window) {}
86 
OnSetVoiceList(Vector<mojom::blink::SpeechSynthesisVoicePtr> mojom_voices)87 void SpeechSynthesis::OnSetVoiceList(
88     Vector<mojom::blink::SpeechSynthesisVoicePtr> mojom_voices) {
89   voice_list_.clear();
90   for (auto& mojom_voice : mojom_voices) {
91     voice_list_.push_back(
92         MakeGarbageCollected<SpeechSynthesisVoice>(std::move(mojom_voice)));
93   }
94   VoicesDidChange();
95 }
96 
getVoices()97 const HeapVector<Member<SpeechSynthesisVoice>>& SpeechSynthesis::getVoices() {
98   // Kick off initialization here to ensure voice list gets populated.
99   ignore_result(TryEnsureMojomSynthesis());
100   RecordVoicesForIdentifiability();
101   return voice_list_;
102 }
103 
RecordVoicesForIdentifiability() const104 void SpeechSynthesis::RecordVoicesForIdentifiability() const {
105   constexpr IdentifiableSurface surface = IdentifiableSurface::FromTypeAndToken(
106       IdentifiableSurface::Type::kWebFeature,
107       WebFeature::kSpeechSynthesis_GetVoices_Method);
108   if (!IdentifiabilityStudySettings::Get()->ShouldSample(surface))
109     return;
110   if (!GetSupplementable()->GetFrame())
111     return;
112 
113   IdentifiableTokenBuilder builder;
114   for (const auto& voice : voice_list_) {
115     builder.AddToken(IdentifiabilityBenignStringToken(voice->voiceURI()));
116     builder.AddToken(IdentifiabilityBenignStringToken(voice->lang()));
117     builder.AddToken(IdentifiabilityBenignStringToken(voice->name()));
118     builder.AddToken(voice->localService());
119   }
120   IdentifiabilityMetricBuilder(GetSupplementable()->UkmSourceID())
121       .Set(surface, builder.GetToken())
122       .Record(GetSupplementable()->UkmRecorder());
123 }
124 
speaking() const125 bool SpeechSynthesis::speaking() const {
126   // If we have a current speech utterance, then that means we're assumed to be
127   // in a speaking state. This state is independent of whether the utterance
128   // happens to be paused.
129   return CurrentSpeechUtterance();
130 }
131 
pending() const132 bool SpeechSynthesis::pending() const {
133   // This is true if there are any utterances that have not started.
134   // That means there will be more than one in the queue.
135   return utterance_queue_.size() > 1;
136 }
137 
paused() const138 bool SpeechSynthesis::paused() const {
139   return is_paused_;
140 }
141 
speak(ScriptState * script_state,SpeechSynthesisUtterance * utterance)142 void SpeechSynthesis::speak(ScriptState* script_state,
143                             SpeechSynthesisUtterance* utterance) {
144   DCHECK(utterance);
145   if (!script_state->ContextIsValid())
146     return;
147 
148   // Note: Non-UseCounter based TTS metrics are of the form TextToSpeech.* and
149   // are generally global, whereas these are scoped to a single page load.
150   UseCounter::Count(GetSupplementable(), WebFeature::kTextToSpeech_Speak);
151   GetSupplementable()->CountUseOnlyInCrossOriginIframe(
152       WebFeature::kTextToSpeech_SpeakCrossOrigin);
153   if (!IsAllowedToStartByAutoplay()) {
154     Deprecation::CountDeprecation(
155         GetSupplementable(),
156         WebFeature::kTextToSpeech_SpeakDisallowedByAutoplay);
157     FireErrorEvent(utterance, 0 /* char_index */, "not-allowed");
158     return;
159   }
160 
161   utterance_queue_.push_back(utterance);
162 
163   // If the queue was empty, speak this immediately.
164   if (utterance_queue_.size() == 1)
165     StartSpeakingImmediately();
166 }
167 
cancel()168 void SpeechSynthesis::cancel() {
169   // Remove all the items from the utterance queue. The platform
170   // may still have references to some of these utterances and may
171   // fire events on them asynchronously.
172   utterance_queue_.clear();
173 
174   if (mojom::blink::SpeechSynthesis* mojom_synthesis =
175           TryEnsureMojomSynthesis())
176     mojom_synthesis->Cancel();
177 }
178 
pause()179 void SpeechSynthesis::pause() {
180   if (is_paused_)
181     return;
182 
183   if (mojom::blink::SpeechSynthesis* mojom_synthesis =
184           TryEnsureMojomSynthesis())
185     mojom_synthesis->Pause();
186 }
187 
resume()188 void SpeechSynthesis::resume() {
189   if (!CurrentSpeechUtterance())
190     return;
191 
192   if (mojom::blink::SpeechSynthesis* mojom_synthesis =
193           TryEnsureMojomSynthesis())
194     mojom_synthesis->Resume();
195 }
196 
DidStartSpeaking(SpeechSynthesisUtterance * utterance)197 void SpeechSynthesis::DidStartSpeaking(SpeechSynthesisUtterance* utterance) {
198   FireEvent(event_type_names::kStart, utterance, 0, 0, String());
199 }
200 
DidPauseSpeaking(SpeechSynthesisUtterance * utterance)201 void SpeechSynthesis::DidPauseSpeaking(SpeechSynthesisUtterance* utterance) {
202   is_paused_ = true;
203   FireEvent(event_type_names::kPause, utterance, 0, 0, String());
204 }
205 
DidResumeSpeaking(SpeechSynthesisUtterance * utterance)206 void SpeechSynthesis::DidResumeSpeaking(SpeechSynthesisUtterance* utterance) {
207   is_paused_ = false;
208   FireEvent(event_type_names::kResume, utterance, 0, 0, String());
209 }
210 
DidFinishSpeaking(SpeechSynthesisUtterance * utterance)211 void SpeechSynthesis::DidFinishSpeaking(SpeechSynthesisUtterance* utterance) {
212   HandleSpeakingCompleted(utterance, false);
213 }
214 
SpeakingErrorOccurred(SpeechSynthesisUtterance * utterance)215 void SpeechSynthesis::SpeakingErrorOccurred(
216     SpeechSynthesisUtterance* utterance) {
217   HandleSpeakingCompleted(utterance, true);
218 }
219 
WordBoundaryEventOccurred(SpeechSynthesisUtterance * utterance,unsigned char_index,unsigned char_length)220 void SpeechSynthesis::WordBoundaryEventOccurred(
221     SpeechSynthesisUtterance* utterance,
222     unsigned char_index,
223     unsigned char_length) {
224   DEFINE_STATIC_LOCAL(const String, word_boundary_string, ("word"));
225   FireEvent(event_type_names::kBoundary, utterance, char_index, char_length,
226             word_boundary_string);
227 }
228 
SentenceBoundaryEventOccurred(SpeechSynthesisUtterance * utterance,unsigned char_index,unsigned char_length)229 void SpeechSynthesis::SentenceBoundaryEventOccurred(
230     SpeechSynthesisUtterance* utterance,
231     unsigned char_index,
232     unsigned char_length) {
233   DEFINE_STATIC_LOCAL(const String, sentence_boundary_string, ("sentence"));
234   FireEvent(event_type_names::kBoundary, utterance, char_index, char_length,
235             sentence_boundary_string);
236 }
237 
VoicesDidChange()238 void SpeechSynthesis::VoicesDidChange() {
239   if (GetSupplementable()->GetFrame())
240     DispatchEvent(*Event::Create(event_type_names::kVoiceschanged));
241 }
242 
StartSpeakingImmediately()243 void SpeechSynthesis::StartSpeakingImmediately() {
244   SpeechSynthesisUtterance* utterance = CurrentSpeechUtterance();
245   DCHECK(utterance);
246 
247   double millis;
248   if (!GetElapsedTimeMillis(&millis))
249     return;
250 
251   utterance->SetStartTime(millis / 1000.0);
252   is_paused_ = false;
253 
254   if (TryEnsureMojomSynthesis())
255     utterance->Start(this);
256 }
257 
HandleSpeakingCompleted(SpeechSynthesisUtterance * utterance,bool error_occurred)258 void SpeechSynthesis::HandleSpeakingCompleted(
259     SpeechSynthesisUtterance* utterance,
260     bool error_occurred) {
261   DCHECK(utterance);
262 
263   bool should_start_speaking = false;
264   // If the utterance that completed was the one we're currently speaking,
265   // remove it from the queue and start speaking the next one.
266   if (utterance == CurrentSpeechUtterance()) {
267     utterance_queue_.pop_front();
268     should_start_speaking = !utterance_queue_.empty();
269   }
270 
271   // Always fire the event, because the platform may have asynchronously
272   // sent an event on an utterance before it got the message that we
273   // canceled it, and we should always report to the user what actually
274   // happened.
275   if (error_occurred) {
276     // TODO(csharrison): Actually pass the correct message. For now just use a
277     // generic error.
278     FireErrorEvent(utterance, 0, "synthesis-failed");
279   } else {
280     FireEvent(event_type_names::kEnd, utterance, 0, 0, String());
281   }
282 
283   // Start the next utterance if we just finished one and one was pending.
284   if (should_start_speaking && !utterance_queue_.IsEmpty())
285     StartSpeakingImmediately();
286 }
287 
FireEvent(const AtomicString & type,SpeechSynthesisUtterance * utterance,uint32_t char_index,uint32_t char_length,const String & name)288 void SpeechSynthesis::FireEvent(const AtomicString& type,
289                                 SpeechSynthesisUtterance* utterance,
290                                 uint32_t char_index,
291                                 uint32_t char_length,
292                                 const String& name) {
293   double millis;
294   if (!GetElapsedTimeMillis(&millis))
295     return;
296 
297   SpeechSynthesisEventInit* init = SpeechSynthesisEventInit::Create();
298   init->setUtterance(utterance);
299   init->setCharIndex(char_index);
300   init->setCharLength(char_length);
301   init->setElapsedTime(millis - (utterance->StartTime() * 1000.0));
302   init->setName(name);
303   utterance->DispatchEvent(*SpeechSynthesisEvent::Create(type, init));
304 }
305 
FireErrorEvent(SpeechSynthesisUtterance * utterance,uint32_t char_index,const String & error)306 void SpeechSynthesis::FireErrorEvent(SpeechSynthesisUtterance* utterance,
307                                      uint32_t char_index,
308                                      const String& error) {
309   double millis;
310   if (!GetElapsedTimeMillis(&millis))
311     return;
312 
313   SpeechSynthesisErrorEventInit* init = SpeechSynthesisErrorEventInit::Create();
314   init->setUtterance(utterance);
315   init->setCharIndex(char_index);
316   init->setElapsedTime(millis - (utterance->StartTime() * 1000.0));
317   init->setError(error);
318   utterance->DispatchEvent(
319       *SpeechSynthesisErrorEvent::Create(event_type_names::kError, init));
320 }
321 
CurrentSpeechUtterance() const322 SpeechSynthesisUtterance* SpeechSynthesis::CurrentSpeechUtterance() const {
323   if (utterance_queue_.IsEmpty())
324     return nullptr;
325 
326   return utterance_queue_.front();
327 }
328 
GetExecutionContext() const329 ExecutionContext* SpeechSynthesis::GetExecutionContext() const {
330   return GetSupplementable();
331 }
332 
Trace(Visitor * visitor) const333 void SpeechSynthesis::Trace(Visitor* visitor) const {
334   visitor->Trace(receiver_);
335   visitor->Trace(mojom_synthesis_);
336   visitor->Trace(voice_list_);
337   visitor->Trace(utterance_queue_);
338   Supplement<LocalDOMWindow>::Trace(visitor);
339   EventTargetWithInlineData::Trace(visitor);
340 }
341 
GetElapsedTimeMillis(double * millis)342 bool SpeechSynthesis::GetElapsedTimeMillis(double* millis) {
343   if (!GetSupplementable()->GetFrame())
344     return false;
345   if (GetSupplementable()->document()->IsStopped())
346     return false;
347 
348   *millis = DOMWindowPerformance::performance(*GetSupplementable())->now();
349   return true;
350 }
351 
IsAllowedToStartByAutoplay() const352 bool SpeechSynthesis::IsAllowedToStartByAutoplay() const {
353   Document* document = GetSupplementable()->document();
354   DCHECK(document);
355 
356   // Note: could check the utterance->volume here, but that could be overriden
357   // in the case of SSML.
358   if (AutoplayPolicy::GetAutoplayPolicyForDocument(*document) !=
359       AutoplayPolicy::Type::kDocumentUserActivationRequired) {
360     return true;
361   }
362   return AutoplayPolicy::IsDocumentAllowedToPlay(*document);
363 }
364 
SetMojomSynthesisForTesting(mojo::PendingRemote<mojom::blink::SpeechSynthesis> mojom_synthesis)365 void SpeechSynthesis::SetMojomSynthesisForTesting(
366     mojo::PendingRemote<mojom::blink::SpeechSynthesis> mojom_synthesis) {
367   mojom_synthesis_.Bind(
368       std::move(mojom_synthesis),
369       GetSupplementable()->GetTaskRunner(TaskType::kMiscPlatformAPI));
370   receiver_.reset();
371   mojom_synthesis_->AddVoiceListObserver(receiver_.BindNewPipeAndPassRemote(
372       GetSupplementable()->GetTaskRunner(TaskType::kMiscPlatformAPI)));
373 }
374 
TryEnsureMojomSynthesis()375 mojom::blink::SpeechSynthesis* SpeechSynthesis::TryEnsureMojomSynthesis() {
376   if (mojom_synthesis_.is_bound())
377     return mojom_synthesis_.get();
378 
379   // The frame could be detached. In that case, calls on mojom_synthesis_ will
380   // just get dropped. That's okay and is simpler than having to null-check
381   // mojom_synthesis_ before each use.
382   LocalDOMWindow* window = GetSupplementable();
383   if (!window->GetFrame())
384     return nullptr;
385 
386   auto receiver = mojom_synthesis_.BindNewPipeAndPassReceiver(
387       window->GetTaskRunner(TaskType::kMiscPlatformAPI));
388 
389   window->GetBrowserInterfaceBroker().GetInterface(std::move(receiver));
390 
391   mojom_synthesis_->AddVoiceListObserver(receiver_.BindNewPipeAndPassRemote(
392       window->GetTaskRunner(TaskType::kMiscPlatformAPI)));
393   return mojom_synthesis_.get();
394 }
395 
InterfaceName() const396 const AtomicString& SpeechSynthesis::InterfaceName() const {
397   return event_target_names::kSpeechSynthesis;
398 }
399 
400 }  // namespace blink
401