// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include #include #include #include #include "base/bind.h" #include "base/command_line.h" #include "base/debug/leak_annotations.h" #include "base/macros.h" #include "base/no_destructor.h" #include "base/sequenced_task_runner.h" #include "base/synchronization/lock.h" #include "base/task/task_traits.h" #include "base/task/thread_pool.h" #include "base/task_runner.h" #include "base/threading/sequence_bound.h" #include "content/browser/speech/tts_platform_impl.h" #include "content/public/browser/browser_task_traits.h" #include "content/public/browser/browser_thread.h" #include "content/public/browser/tts_controller.h" #include "content/public/common/content_switches.h" #include "library_loaders/libspeechd.h" namespace content { namespace { struct SPDChromeVoice { std::string name; std::string module; std::string language; }; using PlatformVoices = std::map; constexpr int kInvalidUtteranceId = -1; constexpr int kInvalidMessageUid = -1; } // namespace class TtsPlatformImplBackgroundWorker { public: TtsPlatformImplBackgroundWorker() = default; TtsPlatformImplBackgroundWorker(const TtsPlatformImplBackgroundWorker&) = delete; TtsPlatformImplBackgroundWorker& operator=( const TtsPlatformImplBackgroundWorker&) = delete; ~TtsPlatformImplBackgroundWorker() = default; void Initialize(); void ProcessSpeech(int utterance_id, const std::string& parsed_utterance, const std::string& lang, float rate, float pitch, SPDChromeVoice voice, base::OnceCallback on_speak_finished); void Pause(); void Resume(); void StopSpeaking(); void Shutdown(); private: bool InitializeSpeechd(); void InitializeVoices(PlatformVoices*); void OpenConnection(); void CloseConnection(); void OnSpeechEvent(int msg_id, SPDNotificationType type); // Send an TTS event notification to the TTS controller. void SendTtsEvent(int utterance_id, TtsEventType event_type, int char_index, int length = -1); static void NotificationCallback(size_t msg_id, size_t client_id, SPDNotificationType type); static void IndexMarkCallback(size_t msg_id, size_t client_id, SPDNotificationType state, char* index_mark); LibSpeechdLoader libspeechd_loader_; SPDConnection* conn_ = nullptr; int msg_uid_ = kInvalidMessageUid; // These apply to the current utterance only that is currently being // processed. int utterance_id_ = kInvalidUtteranceId; size_t utterance_length_ = 0; size_t utterance_char_position_ = 0; }; class TtsPlatformImplLinux : public TtsPlatformImpl { public: TtsPlatformImplLinux(const TtsPlatformImplLinux&) = delete; TtsPlatformImplLinux& operator=(const TtsPlatformImplLinux&) = delete; bool PlatformImplSupported() override; bool PlatformImplInitialized() override; void Speak(int utterance_id, const std::string& utterance, const std::string& lang, const VoiceData& voice, const UtteranceContinuousParameters& params, base::OnceCallback on_speak_finished) override; bool StopSpeaking() override; void Pause() override; void Resume() override; bool IsSpeaking() override; void GetVoices(std::vector* out_voices) override; void Shutdown() override; void OnInitialized(bool success, PlatformVoices voices); void OnSpeakScheduled(base::OnceCallback on_speak_finished, bool success); void OnSpeakFinished(int utterance_id); base::SequenceBound* worker() { return &worker_; } // Get the single instance of this class. static TtsPlatformImplLinux* GetInstance(); private: friend base::NoDestructor; TtsPlatformImplLinux(); void ProcessSpeech(int utterance_id, const std::string& lang, const VoiceData& voice, const UtteranceContinuousParameters& params, base::OnceCallback on_speak_finished, const std::string& parsed_utterance); // Holds the platform state. bool is_supported_ = false; bool is_initialized_ = false; bool is_speaking_ = false; bool paused_ = false; // The current utterance being spoke. int utterance_id_ = kInvalidUtteranceId; // Map a string composed of a voicename and module to the voicename. Used to // uniquely identify a voice across all available modules. PlatformVoices voices_; // Hold the state and the code of the background implementation. base::SequenceBound worker_; }; // // TtsPlatformImplBackgroundWorker // void TtsPlatformImplBackgroundWorker::Initialize() { PlatformVoices voices; if (InitializeSpeechd()) { OpenConnection(); InitializeVoices(&voices); } bool success = (conn_ != nullptr); GetUIThreadTaskRunner({})->PostTask( FROM_HERE, base::BindOnce(&TtsPlatformImplLinux::OnInitialized, base::Unretained(TtsPlatformImplLinux::GetInstance()), success, std::move(voices))); } void TtsPlatformImplBackgroundWorker::ProcessSpeech( int utterance_id, const std::string& parsed_utterance, const std::string& lang, float rate, float pitch, SPDChromeVoice voice, base::OnceCallback on_speak_finished) { libspeechd_loader_.spd_set_output_module(conn_, voice.module.c_str()); libspeechd_loader_.spd_set_synthesis_voice(conn_, voice.name.c_str()); // Map our multiplicative range to Speech Dispatcher's linear range. // .334 = -100. // 3 = 100. libspeechd_loader_.spd_set_voice_rate(conn_, 100 * log10(rate) / log10(3)); libspeechd_loader_.spd_set_voice_pitch(conn_, 100 * log10(pitch) / log10(3)); // Support languages other than the default if (!lang.empty()) libspeechd_loader_.spd_set_language(conn_, lang.c_str()); utterance_id_ = utterance_id; utterance_char_position_ = 0; utterance_length_ = parsed_utterance.size(); // spd_say(...) returns msg_uid on success, -1 otherwise. Each call to spd_say // returns a different msg_uid. msg_uid_ = libspeechd_loader_.spd_say(conn_, SPD_TEXT, parsed_utterance.c_str()); bool success = (msg_uid_ != kInvalidMessageUid); GetUIThreadTaskRunner({})->PostTask( FROM_HERE, base::BindOnce(std::move(on_speak_finished), success)); } void TtsPlatformImplBackgroundWorker::Pause() { if (msg_uid_ != kInvalidMessageUid) libspeechd_loader_.spd_pause(conn_); } void TtsPlatformImplBackgroundWorker::Resume() { if (msg_uid_ != kInvalidMessageUid) libspeechd_loader_.spd_resume(conn_); } void TtsPlatformImplBackgroundWorker::StopSpeaking() { if (msg_uid_ != kInvalidMessageUid) { int result = libspeechd_loader_.spd_stop(conn_); if (result == -1) { CloseConnection(); OpenConnection(); } msg_uid_ = kInvalidMessageUid; utterance_id_ = kInvalidUtteranceId; } } void TtsPlatformImplBackgroundWorker::Shutdown() { CloseConnection(); } bool TtsPlatformImplBackgroundWorker::InitializeSpeechd() { return libspeechd_loader_.Load("libspeechd.so.2"); } void TtsPlatformImplBackgroundWorker::InitializeVoices(PlatformVoices* voices) { if (!conn_) return; char** modules = libspeechd_loader_.spd_list_modules(conn_); if (!modules) return; for (int i = 0; modules[i]; i++) { char* module = modules[i]; libspeechd_loader_.spd_set_output_module(conn_, module); SPDVoice** spd_voices = libspeechd_loader_.spd_list_synthesis_voices(conn_); if (!spd_voices) { free(module); continue; } for (int j = 0; spd_voices[j]; j++) { SPDVoice* spd_voice = spd_voices[j]; SPDChromeVoice spd_data; spd_data.name = spd_voice->name; spd_data.module = module; spd_data.language = spd_voice->language; std::string key; key.append(spd_data.name); key.append(" "); key.append(spd_data.module); voices->insert(std::pair(key, spd_data)); free(spd_voices[j]); } free(modules[i]); } } void TtsPlatformImplBackgroundWorker::OpenConnection() { { // spd_open has memory leaks which are hard to suppress. // http://crbug.com/317360 ANNOTATE_SCOPED_MEMORY_LEAK; conn_ = libspeechd_loader_.spd_open("chrome", "extension_api", nullptr, SPD_MODE_THREADED); } if (!conn_) return; // Register callbacks for all events. conn_->callback_begin = conn_->callback_end = conn_->callback_cancel = conn_->callback_pause = conn_->callback_resume = &TtsPlatformImplBackgroundWorker::NotificationCallback; conn_->callback_im = &TtsPlatformImplBackgroundWorker::IndexMarkCallback; libspeechd_loader_.spd_set_notification_on(conn_, SPD_BEGIN); libspeechd_loader_.spd_set_notification_on(conn_, SPD_END); libspeechd_loader_.spd_set_notification_on(conn_, SPD_CANCEL); libspeechd_loader_.spd_set_notification_on(conn_, SPD_PAUSE); libspeechd_loader_.spd_set_notification_on(conn_, SPD_RESUME); } void TtsPlatformImplBackgroundWorker::CloseConnection() { if (conn_) { libspeechd_loader_.spd_close(conn_); conn_ = nullptr; } } void TtsPlatformImplBackgroundWorker::OnSpeechEvent(int msg_id, SPDNotificationType type) { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); if (msg_id != msg_uid_) return; switch (type) { case SPD_EVENT_BEGIN: utterance_char_position_ = 0; SendTtsEvent(utterance_id_, TTS_EVENT_START, utterance_char_position_, -1); break; case SPD_EVENT_RESUME: SendTtsEvent(utterance_id_, TTS_EVENT_RESUME, utterance_char_position_, -1); break; case SPD_EVENT_END: GetUIThreadTaskRunner({})->PostTask( FROM_HERE, base::BindOnce(&TtsPlatformImplLinux::OnSpeakFinished, base::Unretained(TtsPlatformImplLinux::GetInstance()), utterance_id_)); utterance_char_position_ = utterance_length_; SendTtsEvent(utterance_id_, TTS_EVENT_END, utterance_char_position_, 0); break; case SPD_EVENT_PAUSE: SendTtsEvent(utterance_id_, TTS_EVENT_PAUSE, utterance_char_position_, -1); break; case SPD_EVENT_CANCEL: SendTtsEvent(utterance_id_, TTS_EVENT_CANCELLED, utterance_char_position_, -1); break; case SPD_EVENT_INDEX_MARK: // TODO: Can we get length from linux? If so, update // utterance_char_position_. SendTtsEvent(utterance_id_, TTS_EVENT_MARKER, utterance_char_position_, -1); break; } } void TtsPlatformImplBackgroundWorker::SendTtsEvent(int utterance_id, TtsEventType event_type, int char_index, int length) { GetUIThreadTaskRunner({})->PostTask( FROM_HERE, base::BindOnce(&TtsController::OnTtsEvent, base::Unretained(TtsController::GetInstance()), utterance_id, event_type, char_index, length, std::string())); } // static void TtsPlatformImplBackgroundWorker::NotificationCallback( size_t msg_id, size_t client_id, SPDNotificationType type) { TtsPlatformImplLinux::GetInstance()->worker()->Post( FROM_HERE, &TtsPlatformImplBackgroundWorker::OnSpeechEvent, msg_id, type); } // static void TtsPlatformImplBackgroundWorker::IndexMarkCallback( size_t msg_id, size_t client_id, SPDNotificationType type, char* index_mark) { // TODO(dtseng): index_mark appears to specify an index type supplied by a // client. Need to explore how this is used before hooking it up with existing // word, sentence events. TtsPlatformImplLinux::GetInstance()->worker()->Post( FROM_HERE, &TtsPlatformImplBackgroundWorker::OnSpeechEvent, msg_id, type); } // // TtsPlatformImplLinux // TtsPlatformImplLinux::TtsPlatformImplLinux() : worker_(base::ThreadPool::CreateSequencedTaskRunner({base::MayBlock()})) { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); const base::CommandLine& command_line = *base::CommandLine::ForCurrentProcess(); if (!command_line.HasSwitch(switches::kEnableSpeechDispatcher)) return; // The TTS platform is supported. The Tts platform initialisation will happen // on a worker thread and it will become initialized. is_supported_ = true; worker_.Post(FROM_HERE, &TtsPlatformImplBackgroundWorker::Initialize); } bool TtsPlatformImplLinux::PlatformImplSupported() { return is_supported_; } bool TtsPlatformImplLinux::PlatformImplInitialized() { return is_initialized_; } void TtsPlatformImplLinux::Speak( int utterance_id, const std::string& utterance, const std::string& lang, const VoiceData& voice, const UtteranceContinuousParameters& params, base::OnceCallback on_speak_finished) { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); DCHECK(PlatformImplInitialized()); if (paused_ || is_speaking_) { std::move(on_speak_finished).Run(false); return; } // Flag that a utterance is getting emitted. The |is_speaking_| flag will be // set back to false when the utterance will be fully spoken, stopped or if // the voice synthetizer was not able to emit it. is_speaking_ = true; utterance_id_ = utterance_id; // Parse SSML and process speech. TtsController::GetInstance()->StripSSML( utterance, base::BindOnce(&TtsPlatformImplLinux::ProcessSpeech, base::Unretained(this), utterance_id, lang, voice, params, base::BindOnce(&TtsPlatformImplLinux::OnSpeakScheduled, base::Unretained(this), std::move(on_speak_finished)))); } bool TtsPlatformImplLinux::StopSpeaking() { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); DCHECK(PlatformImplInitialized()); worker_.Post(FROM_HERE, &TtsPlatformImplBackgroundWorker::StopSpeaking); paused_ = false; is_speaking_ = false; utterance_id_ = kInvalidUtteranceId; return true; } void TtsPlatformImplLinux::Pause() { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); DCHECK(PlatformImplInitialized()); if (paused_ || !is_speaking_) return; worker_.Post(FROM_HERE, &TtsPlatformImplBackgroundWorker::Pause); paused_ = true; } void TtsPlatformImplLinux::Resume() { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); DCHECK(PlatformImplInitialized()); if (!paused_ || !is_speaking_) return; worker_.Post(FROM_HERE, &TtsPlatformImplBackgroundWorker::Resume); paused_ = false; } bool TtsPlatformImplLinux::IsSpeaking() { return is_speaking_; } void TtsPlatformImplLinux::GetVoices(std::vector* out_voices) { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); DCHECK(PlatformImplInitialized()); for (auto it = voices_.begin(); it != voices_.end(); ++it) { out_voices->push_back(VoiceData()); VoiceData& voice = out_voices->back(); voice.native = true; voice.name = it->first; voice.lang = it->second.language; voice.events.insert(TTS_EVENT_START); voice.events.insert(TTS_EVENT_END); voice.events.insert(TTS_EVENT_CANCELLED); voice.events.insert(TTS_EVENT_MARKER); voice.events.insert(TTS_EVENT_PAUSE); voice.events.insert(TTS_EVENT_RESUME); } } void TtsPlatformImplLinux::Shutdown() { worker_.Post(FROM_HERE, &TtsPlatformImplBackgroundWorker::Shutdown); } void TtsPlatformImplLinux::OnInitialized(bool success, PlatformVoices voices) { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); if (success) voices_ = std::move(voices); is_initialized_ = true; TtsController::GetInstance()->VoicesChanged(); } void TtsPlatformImplLinux::OnSpeakScheduled( base::OnceCallback on_speak_finished, bool success) { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); DCHECK(is_speaking_); // If the utterance was not able to be emitted, stop the speaking. There // won't be any asynchronous TTS event to confirm the end of the speech. if (!success) { is_speaking_ = false; utterance_id_ = kInvalidUtteranceId; } // Pass the results to our caller. std::move(on_speak_finished).Run(success); } void TtsPlatformImplLinux::OnSpeakFinished(int utterance_id) { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); if (utterance_id != utterance_id_) return; DCHECK(is_speaking_); DCHECK_NE(utterance_id_, kInvalidUtteranceId); is_speaking_ = false; utterance_id_ = kInvalidUtteranceId; } void TtsPlatformImplLinux::ProcessSpeech( int utterance_id, const std::string& lang, const VoiceData& voice, const UtteranceContinuousParameters& params, base::OnceCallback on_speak_finished, const std::string& parsed_utterance) { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); // Speech dispatcher's speech params are around 3x at either limit. float rate = params.rate > 3 ? 3 : params.rate; rate = params.rate < 0.334 ? 0.334 : rate; float pitch = params.pitch > 3 ? 3 : params.pitch; pitch = params.pitch < 0.334 ? 0.334 : pitch; SPDChromeVoice matched_voice; auto it = voices_.find(voice.name); if (it != voices_.end()) matched_voice = it->second; worker_.Post(FROM_HERE, &TtsPlatformImplBackgroundWorker::ProcessSpeech, utterance_id, parsed_utterance, lang, rate, pitch, matched_voice, std::move(on_speak_finished)); } // static TtsPlatformImplLinux* TtsPlatformImplLinux::GetInstance() { static base::NoDestructor tts_platform; return tts_platform.get(); } // static TtsPlatformImpl* TtsPlatformImpl::GetInstance() { return TtsPlatformImplLinux::GetInstance(); } } // namespace content