1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #include "SpeechDispatcherService.h"
8 
9 #include "mozilla/dom/nsSpeechTask.h"
10 #include "mozilla/dom/nsSynthVoiceRegistry.h"
11 #include "mozilla/ClearOnShutdown.h"
12 #include "mozilla/Preferences.h"
13 #include "mozilla/StaticPrefs_media.h"
14 #include "nsEscape.h"
15 #include "nsISupports.h"
16 #include "nsPrintfCString.h"
17 #include "nsReadableUtils.h"
18 #include "nsServiceManagerUtils.h"
19 #include "nsThreadUtils.h"
20 #include "nsXULAppAPI.h"
21 #include "prlink.h"
22 
23 #include <math.h>
24 #include <stdlib.h>
25 
// Prefix of every voice URI built by this file.
#define URI_PREFIX "urn:moz-tts:speechd:"

// Upper and lower rate multipliers used as clamp bounds when a speech rate is
// converted for speechd. They are float so that they can be passed to
// std::min/std::max together with float arguments.
#define MAX_RATE 2.5f
#define MIN_RATE 0.5f
30 
// Local declarations of the libspeechd types used by this file.

// Kind of event delivered to a notification callback.
enum SPDNotificationType {
  SPD_EVENT_BEGIN = 0,
  SPD_EVENT_END = 1,
  SPD_EVENT_INDEX_MARK = 2,
  SPD_EVENT_CANCEL = 3,
  SPD_EVENT_PAUSE = 4,
  SPD_EVENT_RESUME = 5
};
40 
// Bit flags selecting which notifications are switched on; one bit per event
// kind, with SPD_ALL covering all six bits.
enum SPDNotification {
  SPD_BEGIN = 1 << 0,
  SPD_END = 1 << 1,
  SPD_INDEX_MARKS = 1 << 2,
  SPD_CANCEL = 1 << 3,
  SPD_PAUSE = 1 << 4,
  SPD_RESUME = 1 << 5,

  SPD_ALL = 0x3f
};
51 
// Connection mode passed as the last argument of spd_open().
enum SPDConnectionMode {
  SPD_MODE_SINGLE = 0,
  SPD_MODE_THREADED = 1
};
53 
54 typedef void (*SPDCallback)(size_t msg_id, size_t client_id,
55                             SPDNotificationType state);
56 
57 typedef void (*SPDCallbackIM)(size_t msg_id, size_t client_id,
58                               SPDNotificationType state, char* index_mark);
59 
// Leading portion of the libspeechd connection object. Only the callback
// slots at the start of the structure are declared here; this file assigns
// them after spd_open() returns a connection.
struct SPDConnection {
  SPDCallback callback_begin;
  SPDCallback callback_end;
  SPDCallback callback_cancel;
  SPDCallback callback_pause;
  SPDCallback callback_resume;
  SPDCallbackIM callback_im;

  /* partial, more private fields in structure */
};
70 
// One entry of the voice list returned by spd_list_synthesis_voices().
// This file reads `name` and `language`; `variant` is not used here.
struct SPDVoice {
  char* name;
  char* language;
  char* variant;
};
76 
// Message priority passed as the second argument of spd_say().
enum SPDPriority {
  SPD_IMPORTANT = 1,
  SPD_MESSAGE,
  SPD_TEXT,
  SPD_NOTIFICATION,
  SPD_PROGRESS
};
84 
85 #define SPEECHD_FUNCTIONS                                           \
86   FUNC(spd_open, SPDConnection*,                                    \
87        (const char*, const char*, const char*, SPDConnectionMode))  \
88   FUNC(spd_close, void, (SPDConnection*))                           \
89   FUNC(spd_list_synthesis_voices, SPDVoice**, (SPDConnection*))     \
90   FUNC(spd_say, int, (SPDConnection*, SPDPriority, const char*))    \
91   FUNC(spd_cancel, int, (SPDConnection*))                           \
92   FUNC(spd_set_volume, int, (SPDConnection*, int))                  \
93   FUNC(spd_set_voice_rate, int, (SPDConnection*, int))              \
94   FUNC(spd_set_voice_pitch, int, (SPDConnection*, int))             \
95   FUNC(spd_set_synthesis_voice, int, (SPDConnection*, const char*)) \
96   FUNC(spd_set_notification_on, int, (SPDConnection*, SPDNotification))
97 
98 #define FUNC(name, type, params)      \
99   typedef type(*_##name##_fn) params; \
100   static _##name##_fn _##name;
101 
102 SPEECHD_FUNCTIONS
103 
104 #undef FUNC
105 
// Map the plain libspeechd function names onto the file-static `_<name>`
// function pointers, so that the rest of the file can call spd_*() as usual.
#define spd_open _spd_open
#define spd_close _spd_close
#define spd_list_synthesis_voices _spd_list_synthesis_voices
#define spd_say _spd_say
#define spd_cancel _spd_cancel
#define spd_set_volume _spd_set_volume
#define spd_set_voice_rate _spd_set_voice_rate
#define spd_set_voice_pitch _spd_set_voice_pitch
#define spd_set_synthesis_voice _spd_set_synthesis_voice
#define spd_set_notification_on _spd_set_notification_on
116 
// Handle of the dynamically loaded speechd library; null until
// PR_LoadLibrary() succeeds in SpeechDispatcherService::Setup().
static PRLibrary* speechdLib = nullptr;
118 
// Generic function-pointer type under which resolved symbols are stored.
using nsSpeechDispatcherFunc = void (*)();

// Table entry pairing an exported symbol name with the address of the
// function-pointer variable that receives the resolved symbol.
struct nsSpeechDispatcherDynamicFunction {
  const char* functionName;
  nsSpeechDispatcherFunc* function;
};
124 
125 namespace mozilla::dom {
126 
// Process-wide service instance; created lazily by GetInstance() and
// registered there with ClearOnShutdown().
StaticRefPtr<SpeechDispatcherService> SpeechDispatcherService::sSingleton;
128 
// Reference-counted record describing a single voice by name and language.
class SpeechDispatcherVoice {
 public:
  // Copies the given name and language into the record.
  SpeechDispatcherVoice(const nsAString& aName, const nsAString& aLanguage)
      : mName(aName), mLanguage(aLanguage) {}

  NS_INLINE_DECL_THREADSAFE_REFCOUNTING(SpeechDispatcherVoice)

  // Voice name
  nsString mName;

  // Voice language, in BCP-47 syntax
  nsString mLanguage;

 private:
  // Not public: the object is meant to be released through its reference
  // count rather than deleted directly.
  ~SpeechDispatcherVoice() = default;
};
145 
// Per-utterance callback object: forwards speech events to the speech task
// and relays cancel/volume requests to the service's speechd connection.
class SpeechDispatcherCallback final : public nsISpeechTaskCallback {
 public:
  // aTask receives the dispatched events; aService owns the speechd
  // connection used by OnCancel() and OnVolumeChanged().
  SpeechDispatcherCallback(nsISpeechTask* aTask,
                           SpeechDispatcherService* aService)
      : mTask(aTask), mService(aService) {}

  NS_DECL_CYCLE_COLLECTING_ISUPPORTS
  NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(SpeechDispatcherCallback,
                                           nsISpeechTaskCallback)

  NS_DECL_NSISPEECHTASKCALLBACK

  // Translates a speechd event into the matching task notification.
  // Returns true when the utterance is over (end or cancel), i.e. when the
  // caller should drop this callback.
  bool OnSpeechEvent(SPDNotificationType state);

 private:
  ~SpeechDispatcherCallback() = default;

  // This pointer is used to dispatch events
  nsCOMPtr<nsISpeechTask> mTask;

  // By holding a strong reference to the service we guarantee that it won't be
  // destroyed before this callback object.
  RefPtr<SpeechDispatcherService> mService;

  // Set when the begin event arrives; later events report the time elapsed
  // since this stamp.
  TimeStamp mStartTime;
};
172 
// Cycle-collection and XPCOM boilerplate for SpeechDispatcherCallback.
// Only mTask is listed as a cycle-collected member.
NS_IMPL_CYCLE_COLLECTION(SpeechDispatcherCallback, mTask);

// Interfaces answered by QueryInterface; nsISupports is resolved through
// nsISpeechTaskCallback.
NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechDispatcherCallback)
  NS_INTERFACE_MAP_ENTRY(nsISpeechTaskCallback)
  NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsISpeechTaskCallback)
NS_INTERFACE_MAP_END

NS_IMPL_CYCLE_COLLECTING_ADDREF(SpeechDispatcherCallback)
NS_IMPL_CYCLE_COLLECTING_RELEASE(SpeechDispatcherCallback)
182 
// Pause request from the speech task. Intentionally a no-op that reports
// success; see the explanation below.
NS_IMETHODIMP
SpeechDispatcherCallback::OnPause() {
  // XXX: Speech dispatcher does not pause immediately, but waits for the speech
  // to reach an index mark so that it could resume from that offset.
  // There is no support for word or sentence boundaries, so index marks would
  // only occur in explicit SSML marks, and we don't support that yet.
  // What actually happens is that if you call spd_pause(), it will speak
  // the utterance in its entirety, dispatch an end event, and then put speechd
  // in a 'paused' state. Since that is after the utterance ended, we don't get
  // the state change, and our speech API is left in an unrecoverable state.
  // So, since it is useless anyway, pause is not implemented.
  return NS_OK;
}
196 
// Resume request from the speech task. A no-op that reports success, because
// pausing is not implemented either.
NS_IMETHODIMP
SpeechDispatcherCallback::OnResume() {
  // XXX: Unsupported, see OnPause().
  return NS_OK;
}
202 
203 NS_IMETHODIMP
OnCancel()204 SpeechDispatcherCallback::OnCancel() {
205   if (spd_cancel(mService->mSpeechdClient) < 0) {
206     return NS_ERROR_FAILURE;
207   }
208 
209   return NS_OK;
210 }
211 
212 NS_IMETHODIMP
OnVolumeChanged(float aVolume)213 SpeechDispatcherCallback::OnVolumeChanged(float aVolume) {
214   // XXX: This currently does not change the volume mid-utterance, but it
215   // doesn't do anything bad either. So we could put this here with the hopes
216   // that speechd supports this in the future.
217   if (spd_set_volume(mService->mSpeechdClient,
218                      static_cast<int>(aVolume * 100)) < 0) {
219     return NS_ERROR_FAILURE;
220   }
221 
222   return NS_OK;
223 }
224 
OnSpeechEvent(SPDNotificationType state)225 bool SpeechDispatcherCallback::OnSpeechEvent(SPDNotificationType state) {
226   bool remove = false;
227 
228   switch (state) {
229     case SPD_EVENT_BEGIN:
230       mStartTime = TimeStamp::Now();
231       mTask->DispatchStart();
232       break;
233 
234     case SPD_EVENT_PAUSE:
235       mTask->DispatchPause((TimeStamp::Now() - mStartTime).ToSeconds(), 0);
236       break;
237 
238     case SPD_EVENT_RESUME:
239       mTask->DispatchResume((TimeStamp::Now() - mStartTime).ToSeconds(), 0);
240       break;
241 
242     case SPD_EVENT_CANCEL:
243     case SPD_EVENT_END:
244       mTask->DispatchEnd((TimeStamp::Now() - mStartTime).ToSeconds(), 0);
245       remove = true;
246       break;
247 
248     case SPD_EVENT_INDEX_MARK:
249       // Not yet supported
250       break;
251 
252     default:
253       break;
254   }
255 
256   return remove;
257 }
258 
speechd_cb(size_t msg_id,size_t client_id,SPDNotificationType state)259 static void speechd_cb(size_t msg_id, size_t client_id,
260                        SPDNotificationType state) {
261   SpeechDispatcherService* service =
262       SpeechDispatcherService::GetInstance(false);
263 
264   if (service) {
265     NS_DispatchToMainThread(NewRunnableMethod<uint32_t, SPDNotificationType>(
266         "dom::SpeechDispatcherService::EventNotify", service,
267         &SpeechDispatcherService::EventNotify, static_cast<uint32_t>(msg_id),
268         state));
269   }
270 }
271 
// XPCOM boilerplate for SpeechDispatcherService: it answers QueryInterface
// for nsISpeechService and nsIObserver, with nsISupports resolved through
// nsIObserver.
NS_INTERFACE_MAP_BEGIN(SpeechDispatcherService)
  NS_INTERFACE_MAP_ENTRY(nsISpeechService)
  NS_INTERFACE_MAP_ENTRY(nsIObserver)
  NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsIObserver)
NS_INTERFACE_MAP_END

NS_IMPL_ADDREF(SpeechDispatcherService)
NS_IMPL_RELEASE(SpeechDispatcherService)
280 
// Constructs an inert service: not initialized and without a speechd
// connection. The actual setup is started separately by Init().
SpeechDispatcherService::SpeechDispatcherService()
    : mInitialized(false), mSpeechdClient(nullptr) {}
283 
Init()284 void SpeechDispatcherService::Init() {
285   if (!StaticPrefs::media_webspeech_synth_enabled() ||
286       Preferences::GetBool("media.webspeech.synth.test")) {
287     return;
288   }
289 
290   // While speech dispatcher has a "threaded" mode, only spd_say() is async.
291   // Since synchronous socket i/o could impact startup time, we do
292   // initialization in a separate thread.
293   DebugOnly<nsresult> rv =
294       NS_NewNamedThread("speechd init", getter_AddRefs(mInitThread));
295   MOZ_ASSERT(NS_SUCCEEDED(rv));
296   rv = mInitThread->Dispatch(
297       NewRunnableMethod("dom::SpeechDispatcherService::Setup", this,
298                         &SpeechDispatcherService::Setup),
299       NS_DISPATCH_NORMAL);
300   MOZ_ASSERT(NS_SUCCEEDED(rv));
301 }
302 
// Tears the service down: shuts down the init thread if it is still held
// (RegisterVoices() clears it after a completed setup), then closes the
// speechd connection if one was opened.
SpeechDispatcherService::~SpeechDispatcherService() {
  if (mInitThread) {
    mInitThread->Shutdown();
  }

  if (mSpeechdClient) {
    spd_close(mSpeechdClient);
  }
}
312 
Setup()313 void SpeechDispatcherService::Setup() {
314 #define FUNC(name, type, params) {#name, (nsSpeechDispatcherFunc*)&_##name},
315   static const nsSpeechDispatcherDynamicFunction kSpeechDispatcherSymbols[] = {
316       SPEECHD_FUNCTIONS};
317 #undef FUNC
318 
319   MOZ_ASSERT(!mInitialized);
320 
321   speechdLib = PR_LoadLibrary("libspeechd.so.2");
322 
323   if (!speechdLib) {
324     NS_WARNING("Failed to load speechd library");
325     return;
326   }
327 
328   if (!PR_FindFunctionSymbol(speechdLib, "spd_get_volume")) {
329     // There is no version getter function, so we rely on a symbol that was
330     // introduced in release 0.8.2 in order to check for ABI compatibility.
331     NS_WARNING("Unsupported version of speechd detected");
332     return;
333   }
334 
335   for (uint32_t i = 0; i < ArrayLength(kSpeechDispatcherSymbols); i++) {
336     *kSpeechDispatcherSymbols[i].function = PR_FindFunctionSymbol(
337         speechdLib, kSpeechDispatcherSymbols[i].functionName);
338 
339     if (!*kSpeechDispatcherSymbols[i].function) {
340       NS_WARNING(nsPrintfCString("Failed to find speechd symbol for'%s'",
341                                  kSpeechDispatcherSymbols[i].functionName)
342                      .get());
343       return;
344     }
345   }
346 
347   mSpeechdClient =
348       spd_open("firefox", "web speech api", "who", SPD_MODE_THREADED);
349   if (!mSpeechdClient) {
350     NS_WARNING("Failed to call spd_open");
351     return;
352   }
353 
354   // Get all the voices from sapi and register in the SynthVoiceRegistry
355   SPDVoice** list = spd_list_synthesis_voices(mSpeechdClient);
356 
357   mSpeechdClient->callback_begin = speechd_cb;
358   mSpeechdClient->callback_end = speechd_cb;
359   mSpeechdClient->callback_cancel = speechd_cb;
360   mSpeechdClient->callback_pause = speechd_cb;
361   mSpeechdClient->callback_resume = speechd_cb;
362 
363   spd_set_notification_on(mSpeechdClient, SPD_BEGIN);
364   spd_set_notification_on(mSpeechdClient, SPD_END);
365   spd_set_notification_on(mSpeechdClient, SPD_CANCEL);
366 
367   if (list != NULL) {
368     for (int i = 0; list[i]; i++) {
369       nsAutoString uri;
370 
371       uri.AssignLiteral(URI_PREFIX);
372       nsAutoCString name;
373       NS_EscapeURL(list[i]->name, -1,
374                    esc_OnlyNonASCII | esc_Spaces | esc_AlwaysCopy, name);
375       uri.Append(NS_ConvertUTF8toUTF16(name));
376 
377       uri.AppendLiteral("?");
378 
379       nsAutoCString lang(list[i]->language);
380 
381       uri.Append(NS_ConvertUTF8toUTF16(lang));
382 
383       mVoices.InsertOrUpdate(uri, MakeRefPtr<SpeechDispatcherVoice>(
384                                       NS_ConvertUTF8toUTF16(list[i]->name),
385                                       NS_ConvertUTF8toUTF16(lang)));
386     }
387   }
388 
389   NS_DispatchToMainThread(
390       NewRunnableMethod("dom::SpeechDispatcherService::RegisterVoices", this,
391                         &SpeechDispatcherService::RegisterVoices));
392 
393   // mInitialized = true;
394 }
395 
396 // private methods
397 
RegisterVoices()398 void SpeechDispatcherService::RegisterVoices() {
399   RefPtr<nsSynthVoiceRegistry> registry = nsSynthVoiceRegistry::GetInstance();
400   for (const auto& entry : mVoices) {
401     const RefPtr<SpeechDispatcherVoice>& voice = entry.GetData();
402 
403     // This service can only speak one utterance at a time, so we set
404     // aQueuesUtterances to true in order to track global state and schedule
405     // access to this service.
406     DebugOnly<nsresult> rv =
407         registry->AddVoice(this, entry.GetKey(), voice->mName, voice->mLanguage,
408                            voice->mName.EqualsLiteral("default"), true);
409 
410     NS_WARNING_ASSERTION(NS_SUCCEEDED(rv), "Failed to add voice");
411   }
412 
413   mInitThread->Shutdown();
414   mInitThread = nullptr;
415 
416   mInitialized = true;
417 
418   registry->NotifyVoicesChanged();
419 }
420 
421 // nsIObserver
422 
// nsIObserver implementation. All notifications are ignored; the method
// always reports success.
NS_IMETHODIMP
SpeechDispatcherService::Observe(nsISupports* aSubject, const char* aTopic,
                                 const char16_t* aData) {
  return NS_OK;
}
428 
429 // nsISpeechService
430 
431 // TODO: Support SSML
432 NS_IMETHODIMP
Speak(const nsAString & aText,const nsAString & aUri,float aVolume,float aRate,float aPitch,nsISpeechTask * aTask)433 SpeechDispatcherService::Speak(const nsAString& aText, const nsAString& aUri,
434                                float aVolume, float aRate, float aPitch,
435                                nsISpeechTask* aTask) {
436   if (NS_WARN_IF(!mInitialized)) {
437     return NS_ERROR_NOT_AVAILABLE;
438   }
439 
440   RefPtr<SpeechDispatcherCallback> callback =
441       new SpeechDispatcherCallback(aTask, this);
442 
443   bool found = false;
444   SpeechDispatcherVoice* voice = mVoices.GetWeak(aUri, &found);
445 
446   if (NS_WARN_IF(!(found))) {
447     return NS_ERROR_NOT_AVAILABLE;
448   }
449 
450   spd_set_synthesis_voice(mSpeechdClient,
451                           NS_ConvertUTF16toUTF8(voice->mName).get());
452 
453   // We provide a volume of 0.0 to 1.0, speech-dispatcher expects 0 - 100.
454   spd_set_volume(mSpeechdClient, static_cast<int>(aVolume * 100));
455 
456   // aRate is a value of 0.1 (0.1x) to 10 (10x) with 1 (1x) being normal rate.
457   // speechd expects -100 to 100 with 0 being normal rate.
458   float rate = 0;
459   if (aRate > 1) {
460     // Each step to 100 is logarithmically distributed up to 2.5x.
461     rate = log10(std::min(aRate, MAX_RATE)) / log10(MAX_RATE) * 100;
462   } else if (aRate < 1) {
463     // Each step to -100 is logarithmically distributed down to 0.5x.
464     rate = log10(std::max(aRate, MIN_RATE)) / log10(MIN_RATE) * -100;
465   }
466 
467   spd_set_voice_rate(mSpeechdClient, static_cast<int>(rate));
468 
469   // We provide a pitch of 0 to 2 with 1 being the default.
470   // speech-dispatcher expects -100 to 100 with 0 being default.
471   spd_set_voice_pitch(mSpeechdClient, static_cast<int>((aPitch - 1) * 100));
472 
473   nsresult rv = aTask->Setup(callback);
474 
475   if (NS_FAILED(rv)) {
476     return rv;
477   }
478 
479   if (aText.Length()) {
480     int msg_id = spd_say(mSpeechdClient, SPD_MESSAGE,
481                          NS_ConvertUTF16toUTF8(aText).get());
482 
483     if (msg_id < 0) {
484       return NS_ERROR_FAILURE;
485     }
486 
487     mCallbacks.InsertOrUpdate(msg_id, std::move(callback));
488   } else {
489     // Speech dispatcher does not work well with empty strings.
490     // In that case, don't send empty string to speechd,
491     // and just emulate a speechd start and end event.
492     NS_DispatchToMainThread(NewRunnableMethod<SPDNotificationType>(
493         "dom::SpeechDispatcherCallback::OnSpeechEvent", callback,
494         &SpeechDispatcherCallback::OnSpeechEvent, SPD_EVENT_BEGIN));
495 
496     NS_DispatchToMainThread(NewRunnableMethod<SPDNotificationType>(
497         "dom::SpeechDispatcherCallback::OnSpeechEvent", callback,
498         &SpeechDispatcherCallback::OnSpeechEvent, SPD_EVENT_END));
499   }
500 
501   return NS_OK;
502 }
503 
GetInstance(bool create)504 SpeechDispatcherService* SpeechDispatcherService::GetInstance(bool create) {
505   if (XRE_GetProcessType() != GeckoProcessType_Default) {
506     MOZ_ASSERT(
507         false,
508         "SpeechDispatcherService can only be started on main gecko process");
509     return nullptr;
510   }
511 
512   if (!sSingleton && create) {
513     sSingleton = new SpeechDispatcherService();
514     sSingleton->Init();
515     ClearOnShutdown(&sSingleton);
516   }
517 
518   return sSingleton;
519 }
520 
521 already_AddRefed<SpeechDispatcherService>
GetInstanceForService()522 SpeechDispatcherService::GetInstanceForService() {
523   MOZ_ASSERT(NS_IsMainThread());
524   RefPtr<SpeechDispatcherService> sapiService = GetInstance();
525   return sapiService.forget();
526 }
527 
EventNotify(uint32_t aMsgId,uint32_t aState)528 void SpeechDispatcherService::EventNotify(uint32_t aMsgId, uint32_t aState) {
529   SpeechDispatcherCallback* callback = mCallbacks.GetWeak(aMsgId);
530 
531   if (callback) {
532     if (callback->OnSpeechEvent((SPDNotificationType)aState)) {
533       mCallbacks.Remove(aMsgId);
534     }
535   }
536 }
537 
538 }  // namespace mozilla::dom
539