/*
 * Copyright (C) 2013 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "third_party/blink/renderer/modules/speech/speech_synthesis.h"

#include "build/build_config.h"
#include "third_party/blink/public/common/browser_interface_broker_proxy.h"
#include "third_party/blink/public/common/thread_safe_browser_interface_broker_proxy.h"
#include "third_party/blink/public/platform/platform.h"
#include "third_party/blink/renderer/bindings/modules/v8/v8_speech_synthesis_error_event_init.h"
#include "third_party/blink/renderer/bindings/modules/v8/v8_speech_synthesis_event_init.h"
#include "third_party/blink/renderer/core/dom/document.h"
#include "third_party/blink/renderer/core/execution_context/execution_context.h"
#include "third_party/blink/renderer/core/frame/deprecation.h"
#include "third_party/blink/renderer/core/html/media/autoplay_policy.h"
#include "third_party/blink/renderer/core/timing/dom_window_performance.h"
#include "third_party/blink/renderer/core/timing/performance.h"
#include "third_party/blink/renderer/modules/speech/speech_synthesis_error_event.h"
#include "third_party/blink/renderer/modules/speech/speech_synthesis_event.h"
#include "third_party/blink/renderer/platform/instrumentation/use_counter.h"

namespace blink {

SpeechSynthesis* SpeechSynthesis::Create(ExecutionContext* context) {
  SpeechSynthesis* synthesis = MakeGarbageCollected<SpeechSynthesis>(context);
#if defined(OS_ANDROID)
  // On Android devices we lazily initialize |mojom_synthesis_| to avoid
  // needlessly binding to the TTS service, see https://crbug.com/811929.
  // TODO(crbug/811929): Consider moving this logic into the Android-
  // specific backend implementation.
#else
  synthesis->InitializeMojomSynthesis();
#endif
  return synthesis;
}

SpeechSynthesis* SpeechSynthesis::CreateForTesting(
    ExecutionContext* context,
    mojo::PendingRemote<mojom::blink::SpeechSynthesis> mojom_synthesis) {
  SpeechSynthesis* synthesis = MakeGarbageCollected<SpeechSynthesis>(context);
  synthesis->SetMojomSynthesisForTesting(std::move(mojom_synthesis));
  return synthesis;
}
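
// A minimal sketch of how a test might use CreateForTesting(); the
// FakeSpeechSynthesis type below is hypothetical, standing in for any
// implementation of mojom::blink::SpeechSynthesis:
//
//   mojo::PendingRemote<mojom::blink::SpeechSynthesis> remote;
//   FakeSpeechSynthesis fake(remote.InitWithNewPipeAndPassReceiver());
//   auto* synthesis =
//       SpeechSynthesis::CreateForTesting(context, std::move(remote));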

SpeechSynthesis::SpeechSynthesis(ExecutionContext* context)
    : ExecutionContextClient(context),
      receiver_(this, context),
      mojom_synthesis_(context) {
  DCHECK(!GetExecutionContext() || GetExecutionContext()->IsDocument());
}

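// Called with an updated list of available voices from the browser process;
// replaces the cached list and notifies listeners via a 'voiceschanged'
// event.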
void SpeechSynthesis::OnSetVoiceList(
    Vector<mojom::blink::SpeechSynthesisVoicePtr> mojom_voices) {
  voice_list_.clear();
  for (auto& mojom_voice : mojom_voices) {
    voice_list_.push_back(
        MakeGarbageCollected<SpeechSynthesisVoice>(std::move(mojom_voice)));
  }
  VoicesDidChange();
}

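// Note that the voice list is populated asynchronously: early callers may see
// an empty vector until OnSetVoiceList() has run, after which a
// 'voiceschanged' event is dispatched.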
const HeapVector<Member<SpeechSynthesisVoice>>& SpeechSynthesis::getVoices() {
  // Kick off initialization here to ensure the voice list gets populated.
  InitializeMojomSynthesisIfNeeded();
  return voice_list_;
}

bool SpeechSynthesis::speaking() const {
  // If there is a current speech utterance, we are in a speaking state,
  // regardless of whether that utterance happens to be paused.
  return CurrentSpeechUtterance();
}

bool SpeechSynthesis::pending() const {
  // This is true if any utterance has not yet started. Since the front of the
  // queue is the utterance currently being spoken, that means there is more
  // than one entry in the queue.
  return utterance_queue_.size() > 1;
}

bool SpeechSynthesis::paused() const {
  return is_paused_;
}

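// Implements speechSynthesis.speak() from the Web Speech API: the utterance
// is appended to the queue and starts immediately only if nothing else is
// queued; otherwise it plays once the earlier utterances complete.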
void SpeechSynthesis::speak(SpeechSynthesisUtterance* utterance) {
  DCHECK(utterance);
  Document* document = Document::From(GetExecutionContext());
  if (!document)
    return;

  // Note: Non-UseCounter based TTS metrics are of the form TextToSpeech.* and
  // are generally global, whereas these are scoped to a single page load.
  UseCounter::Count(document, WebFeature::kTextToSpeech_Speak);
  document->CountUseOnlyInCrossOriginIframe(
      WebFeature::kTextToSpeech_SpeakCrossOrigin);
  if (!IsAllowedToStartByAutoplay()) {
    Deprecation::CountDeprecation(
        document, WebFeature::kTextToSpeech_SpeakDisallowedByAutoplay);
    FireErrorEvent(utterance, 0 /* char_index */, "not-allowed");
    return;
  }

  utterance_queue_.push_back(utterance);

  // If the queue was empty, speak this utterance immediately.
  if (utterance_queue_.size() == 1)
    StartSpeakingImmediately();
}

void SpeechSynthesis::cancel() {
  // Remove all the items from the utterance queue. The platform
  // may still have references to some of these utterances and may
  // fire events on them asynchronously.
  utterance_queue_.clear();

  InitializeMojomSynthesisIfNeeded();
  mojom_synthesis_->Cancel();
}

void SpeechSynthesis::pause() {
  if (is_paused_)
    return;

  InitializeMojomSynthesisIfNeeded();
  mojom_synthesis_->Pause();
}

void SpeechSynthesis::resume() {
  if (!CurrentSpeechUtterance())
    return;

  InitializeMojomSynthesisIfNeeded();
  mojom_synthesis_->Resume();
}

void SpeechSynthesis::DidStartSpeaking(SpeechSynthesisUtterance* utterance) {
  FireEvent(event_type_names::kStart, utterance, 0, 0, String());
}

void SpeechSynthesis::DidPauseSpeaking(SpeechSynthesisUtterance* utterance) {
  is_paused_ = true;
  FireEvent(event_type_names::kPause, utterance, 0, 0, String());
}

void SpeechSynthesis::DidResumeSpeaking(SpeechSynthesisUtterance* utterance) {
  is_paused_ = false;
  FireEvent(event_type_names::kResume, utterance, 0, 0, String());
}

void SpeechSynthesis::DidFinishSpeaking(SpeechSynthesisUtterance* utterance) {
  HandleSpeakingCompleted(utterance, false);
}

void SpeechSynthesis::SpeakingErrorOccurred(
    SpeechSynthesisUtterance* utterance) {
  HandleSpeakingCompleted(utterance, true);
}

void SpeechSynthesis::WordBoundaryEventOccurred(
    SpeechSynthesisUtterance* utterance,
    unsigned char_index,
    unsigned char_length) {
  DEFINE_STATIC_LOCAL(const String, word_boundary_string, ("word"));
  FireEvent(event_type_names::kBoundary, utterance, char_index, char_length,
            word_boundary_string);
}

void SpeechSynthesis::SentenceBoundaryEventOccurred(
    SpeechSynthesisUtterance* utterance,
    unsigned char_index,
    unsigned char_length) {
  DEFINE_STATIC_LOCAL(const String, sentence_boundary_string, ("sentence"));
  FireEvent(event_type_names::kBoundary, utterance, char_index, char_length,
            sentence_boundary_string);
}

void SpeechSynthesis::VoicesDidChange() {
  if (GetExecutionContext())
    DispatchEvent(*Event::Create(event_type_names::kVoiceschanged));
}

void SpeechSynthesis::StartSpeakingImmediately() {
  SpeechSynthesisUtterance* utterance = CurrentSpeechUtterance();
  DCHECK(utterance);

  double millis;
  if (!GetElapsedTimeMillis(&millis))
    return;

  // Utterance start times are stored in seconds.
  utterance->SetStartTime(millis / 1000.0);
  is_paused_ = false;

  InitializeMojomSynthesisIfNeeded();
  utterance->Start(this);
}

void SpeechSynthesis::HandleSpeakingCompleted(
    SpeechSynthesisUtterance* utterance,
    bool error_occurred) {
  DCHECK(utterance);

  bool should_start_speaking = false;
  // If the utterance that completed was the one we're currently speaking,
  // remove it from the queue and start speaking the next one.
  if (utterance == CurrentSpeechUtterance()) {
    utterance_queue_.pop_front();
    should_start_speaking = !utterance_queue_.IsEmpty();
  }

  // Always fire the event, because the platform may have asynchronously
  // sent an event on an utterance before it got the message that we
  // canceled it, and we should always report to the user what actually
  // happened.
  if (error_occurred) {
    // TODO(csharrison): Actually pass the correct message. For now just use a
    // generic error.
    FireErrorEvent(utterance, 0, "synthesis-failed");
  } else {
    FireEvent(event_type_names::kEnd, utterance, 0, 0, String());
  }

  // Start the next utterance if we just finished one and one was pending.
  // Re-check the queue here: the event fired above may have run script that
  // called cancel() and emptied it.
  if (should_start_speaking && !utterance_queue_.IsEmpty())
    StartSpeakingImmediately();
}

void SpeechSynthesis::FireEvent(const AtomicString& type,
                                SpeechSynthesisUtterance* utterance,
                                uint32_t char_index,
                                uint32_t char_length,
                                const String& name) {
  double millis;
  if (!GetElapsedTimeMillis(&millis))
    return;

  SpeechSynthesisEventInit* init = SpeechSynthesisEventInit::Create();
  init->setUtterance(utterance);
  init->setCharIndex(char_index);
  init->setCharLength(char_length);
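  // StartTime() was recorded in seconds (see StartSpeakingImmediately()), so
  // convert it back to milliseconds before computing the elapsed time.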
  init->setElapsedTime(millis - (utterance->StartTime() * 1000.0));
  init->setName(name);
  utterance->DispatchEvent(*SpeechSynthesisEvent::Create(type, init));
}

void SpeechSynthesis::FireErrorEvent(SpeechSynthesisUtterance* utterance,
                                     uint32_t char_index,
                                     const String& error) {
  double millis;
  if (!GetElapsedTimeMillis(&millis))
    return;

  SpeechSynthesisErrorEventInit* init = SpeechSynthesisErrorEventInit::Create();
  init->setUtterance(utterance);
  init->setCharIndex(char_index);
  init->setElapsedTime(millis - (utterance->StartTime() * 1000.0));
  init->setError(error);
  utterance->DispatchEvent(
      *SpeechSynthesisErrorEvent::Create(event_type_names::kError, init));
}

SpeechSynthesisUtterance* SpeechSynthesis::CurrentSpeechUtterance() const {
  if (utterance_queue_.IsEmpty())
    return nullptr;

  return utterance_queue_.front();
}

void SpeechSynthesis::Trace(Visitor* visitor) {
  visitor->Trace(receiver_);
  visitor->Trace(mojom_synthesis_);
  visitor->Trace(voice_list_);
  visitor->Trace(utterance_queue_);
  ExecutionContextClient::Trace(visitor);
  EventTargetWithInlineData::Trace(visitor);
}

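// Reads the current time from the window's Performance clock, in
// milliseconds. Returns false when no live document or window is attached, in
// which case callers skip firing their event.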
bool SpeechSynthesis::GetElapsedTimeMillis(double* millis) {
  if (!GetExecutionContext())
    return false;
  Document* delegate_document = Document::From(GetExecutionContext());
  if (!delegate_document || delegate_document->IsStopped())
    return false;
  LocalDOMWindow* delegate_dom_window = delegate_document->domWindow();
  if (!delegate_dom_window)
    return false;

  *millis = DOMWindowPerformance::performance(*delegate_dom_window)->now();
  return true;
}

bool SpeechSynthesis::IsAllowedToStartByAutoplay() const {
  Document* document = Document::From(GetExecutionContext());
  DCHECK(document);

  // Note: we could check utterance->volume here, but that could be overridden
  // in the case of SSML.
  if (AutoplayPolicy::GetAutoplayPolicyForDocument(*document) !=
      AutoplayPolicy::Type::kDocumentUserActivationRequired) {
    return true;
  }
  return AutoplayPolicy::IsDocumentAllowedToPlay(*document);
}

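// Mirrors InitializeMojomSynthesis(), but binds the injected test remote
// instead of going through the BrowserInterfaceBroker, and re-registers this
// object as the voice list observer.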
void SpeechSynthesis::SetMojomSynthesisForTesting(
    mojo::PendingRemote<mojom::blink::SpeechSynthesis> mojom_synthesis) {
  mojom_synthesis_.Bind(
      std::move(mojom_synthesis),
      GetExecutionContext()->GetTaskRunner(TaskType::kMiscPlatformAPI));
  receiver_.reset();
  mojom_synthesis_->AddVoiceListObserver(receiver_.BindNewPipeAndPassRemote(
      GetExecutionContext()->GetTaskRunner(TaskType::kMiscPlatformAPI)));
}

void SpeechSynthesis::InitializeMojomSynthesis() {
  DCHECK(!mojom_synthesis_.is_bound());

  // The frame could be detached. In that case, calls on mojom_synthesis_ will
  // just get dropped. That's okay and is simpler than having to null-check
  // mojom_synthesis_ before each use.
  ExecutionContext* context = GetExecutionContext();

  if (!context)
    return;

  auto receiver = mojom_synthesis_.BindNewPipeAndPassReceiver(
      context->GetTaskRunner(TaskType::kMiscPlatformAPI));

  context->GetBrowserInterfaceBroker().GetInterface(std::move(receiver));

  mojom_synthesis_->AddVoiceListObserver(receiver_.BindNewPipeAndPassRemote(
      context->GetTaskRunner(TaskType::kMiscPlatformAPI)));
}

void SpeechSynthesis::InitializeMojomSynthesisIfNeeded() {
  if (!mojom_synthesis_.is_bound())
    InitializeMojomSynthesis();
}

const AtomicString& SpeechSynthesis::InterfaceName() const {
  return event_target_names::kSpeechSynthesis;
}

}  // namespace blink