// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <math.h>
#include <objbase.h>
#include <sapi.h>
#include <stdint.h>
#include <wrl/client.h>
#include <wrl/implements.h>

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

#include "base/bind.h"
#include "base/macros.h"
#include "base/no_destructor.h"
#include "base/sequenced_task_runner.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_piece.h"
#include "base/strings/utf_string_conversions.h"
#include "base/synchronization/lock.h"
#include "base/task/task_traits.h"
#include "base/task/thread_pool.h"
#include "base/task_runner.h"
#include "base/thread_annotations.h"
#include "base/threading/sequence_bound.h"
#include "base/values.h"
#include "base/win/scoped_co_mem.h"
#include "base/win/sphelper.h"
#include "content/browser/speech/tts_platform_impl.h"
#include "content/public/browser/browser_task_traits.h"
#include "content/public/browser/browser_thread.h"
#include "content/public/browser/tts_controller.h"

namespace content {

namespace {

class TtsPlatformImplWin;
class TtsPlatformImplBackgroundWorker;

constexpr int kInvalidUtteranceId = -1;

// ISpObjectToken key and value names.
const wchar_t kAttributesKey[] = L"Attributes";
const wchar_t kLanguageValue[] = L"Language";

// This COM interface is receiving the TTS events on the ISpVoice asynchronous
// worker thread and is emitting a notification task
// TtsPlatformImplBackgroundWorker::SendTtsEvent(...) on the worker sequence.
class TtsEventSink : public Microsoft::WRL::RuntimeClass< Microsoft::WRL::RuntimeClassFlags, ISpNotifySink> { public: TtsEventSink(TtsPlatformImplBackgroundWorker* worker, scoped_refptr worker_task_runner) : worker_(worker), worker_task_runner_(std::move(worker_task_runner)) {} // ISpNotifySink: IFACEMETHODIMP Notify(void) override; int GetUtteranceId() { base::AutoLock lock(lock_); return utterance_id_; } void SetUtteranceId(int utterance_id) { base::AutoLock lock(lock_); utterance_id_ = utterance_id; } private: // |worker_| is leaky and must never deleted because TtsEventSink posts // asynchronous tasks to it. TtsPlatformImplBackgroundWorker* worker_; scoped_refptr worker_task_runner_; base::Lock lock_; int utterance_id_ GUARDED_BY(lock_); }; class TtsPlatformImplBackgroundWorker { public: explicit TtsPlatformImplBackgroundWorker( scoped_refptr task_runner) : tts_event_sink_( Microsoft::WRL::Make(this, std::move(task_runner))) {} TtsPlatformImplBackgroundWorker(const TtsPlatformImplBackgroundWorker&) = delete; TtsPlatformImplBackgroundWorker& operator=( const TtsPlatformImplBackgroundWorker&) = delete; ~TtsPlatformImplBackgroundWorker() = default; void Initialize(); void ProcessSpeech(int utterance_id, const std::string& lang, const VoiceData& voice, const UtteranceContinuousParameters& params, base::OnceCallback on_speak_finished, const std::string& parsed_utterance); void StopSpeaking(bool paused); void Pause(); void Resume(); void Shutdown(); // This function is called after being notified by the speech synthetizer that // there are TTS notifications are available and should be they should be // processed. void OnSpeechEvent(int utterance_id); // Send an TTS event notification to the TTS controller. 
void SendTtsEvent(int utterance_id, TtsEventType event_type, int char_index, int length = -1); private: void GetVoices(std::vector* voices); void SetVoiceFromName(const std::string& name); // These apply to the current utterance only that is currently being processed // on the worker thread. TTS events are dispatched by TtsEventSink to this // class and update the current speaking state of the utterance. std::string last_voice_name_; ULONG stream_number_ = 0u; int utterance_id_ = kInvalidUtteranceId; size_t utterance_char_position_ = 0u; size_t utterance_prefix_length_ = 0u; size_t utterance_length_ = 0u; // The COM class ISpVoice lives within the COM MTA apartment (worker pool). // This interface can not be called on the UI thread since UI thread is // COM STA. Microsoft::WRL::ComPtr speech_synthesizer_; Microsoft::WRL::ComPtr tts_event_sink_; }; class TtsPlatformImplWin : public TtsPlatformImpl { public: TtsPlatformImplWin(const TtsPlatformImplWin&) = delete; TtsPlatformImplWin& operator=(const TtsPlatformImplWin&) = delete; bool PlatformImplSupported() override { return true; } bool PlatformImplInitialized() override; void Speak(int utterance_id, const std::string& utterance, const std::string& lang, const VoiceData& voice, const UtteranceContinuousParameters& params, base::OnceCallback on_speak_finished) override; bool StopSpeaking() override; void Pause() override; void Resume() override; bool IsSpeaking() override; void GetVoices(std::vector* out_voices) override; void Shutdown() override; void OnInitializeComplete(bool success, std::vector voices); void OnSpeakScheduled(base::OnceCallback on_speak_finished, bool success); void OnSpeakFinished(int utterance_id); // Get the single instance of this class. 
static TtsPlatformImplWin* GetInstance(); private: friend base::NoDestructor; TtsPlatformImplWin(); void ProcessSpeech(int utterance_id, const std::string& lang, const VoiceData& voice, const UtteranceContinuousParameters& params, base::OnceCallback on_speak_finished, const std::string& parsed_utterance); void FinishCurrentUtterance(); // These variables hold the platform state. bool paused_ = false; bool is_speaking_ = false; int utterance_id_ = kInvalidUtteranceId; bool platform_initialized_ = false; std::vector voices_; // Hold the state and the code of the background implementation. scoped_refptr worker_task_runner_; base::SequenceBound worker_; }; HRESULT TtsEventSink::Notify() { worker_task_runner_->PostTask( FROM_HERE, base::BindOnce(&TtsPlatformImplBackgroundWorker::OnSpeechEvent, base::Unretained(worker_), GetUtteranceId())); return S_OK; } // // TtsPlatformImplBackgroundWorker // void TtsPlatformImplBackgroundWorker::Initialize() { bool success = false; std::vector voices; ::CoCreateInstance(CLSID_SpVoice, nullptr, CLSCTX_ALL, IID_PPV_ARGS(&speech_synthesizer_)); if (speech_synthesizer_.Get()) { ULONGLONG event_mask = SPFEI(SPEI_START_INPUT_STREAM) | SPFEI(SPEI_TTS_BOOKMARK) | SPFEI(SPEI_WORD_BOUNDARY) | SPFEI(SPEI_SENTENCE_BOUNDARY) | SPFEI(SPEI_END_INPUT_STREAM); speech_synthesizer_->SetInterest(event_mask, event_mask); speech_synthesizer_->SetNotifySink(tts_event_sink_.Get()); GetVoices(&voices); success = true; } GetUIThreadTaskRunner({})->PostTask( FROM_HERE, base::BindOnce(&TtsPlatformImplWin::OnInitializeComplete, base::Unretained(TtsPlatformImplWin::GetInstance()), success, std::move(voices))); } void TtsPlatformImplBackgroundWorker::ProcessSpeech( int utterance_id, const std::string& lang, const VoiceData& voice, const UtteranceContinuousParameters& params, base::OnceCallback on_speak_finished, const std::string& parsed_utterance) { DCHECK(speech_synthesizer_.Get()); SetVoiceFromName(voice.name); if (params.rate >= 0.0) { // Map our 
multiplicative range of 0.1x to 10.0x onto Microsoft's // linear range of -10 to 10: // 0.1 -> -10 // 1.0 -> 0 // 10.0 -> 10 speech_synthesizer_->SetRate(static_cast(10 * log10(params.rate))); } std::wstring prefix; std::wstring suffix; if (params.pitch >= 0.0) { // The TTS api allows a range of -10 to 10 for speech pitch: // https://docs.microsoft.com/en-us/previous-versions/windows/desktop/ms720500(v%3Dvs.85) // Note that the API requires an integer value, so be sure to cast the pitch // value to an int before calling NumberToString16. TODO(dtseng): cleanup if // we ever use any other properties that require xml. double adjusted_pitch = std::max(-10, std::min(params.pitch * 10 - 10, 10)); std::wstring adjusted_pitch_string = base::NumberToString16(static_cast(adjusted_pitch)); prefix = L""; suffix = L""; } if (params.volume >= 0.0) { // The TTS api allows a range of 0 to 100 for speech volume. speech_synthesizer_->SetVolume(static_cast(params.volume * 100)); } // TODO(dmazzoni): convert SSML to SAPI xml. http://crbug.com/88072 std::wstring utterance = base::UTF8ToWide(parsed_utterance); std::wstring merged_utterance = prefix + utterance + suffix; utterance_id_ = utterance_id; utterance_char_position_ = 0; utterance_length_ = utterance.size(); utterance_prefix_length_ = prefix.size(); tts_event_sink_->SetUtteranceId(utterance_id); HRESULT result = speech_synthesizer_->Speak(merged_utterance.c_str(), SPF_ASYNC, &stream_number_); bool success = (result == S_OK); GetUIThreadTaskRunner({})->PostTask( FROM_HERE, base::BindOnce(std::move(on_speak_finished), success)); } void TtsPlatformImplWin::FinishCurrentUtterance() { if (paused_) Resume(); DCHECK(is_speaking_); DCHECK_NE(utterance_id_, kInvalidUtteranceId); is_speaking_ = false; utterance_id_ = kInvalidUtteranceId; } void TtsPlatformImplBackgroundWorker::StopSpeaking(bool paused) { if (speech_synthesizer_.Get()) { // Block notifications from the current utterance. 
tts_event_sink_->SetUtteranceId(kInvalidUtteranceId); utterance_id_ = kInvalidUtteranceId; // Stop speech by speaking nullptr with the purge flag. speech_synthesizer_->Speak(nullptr, SPF_PURGEBEFORESPEAK, nullptr); // Ensures the synthesizer is not paused after a stop. if (paused) speech_synthesizer_->Resume(); } } void TtsPlatformImplBackgroundWorker::Pause() { if (speech_synthesizer_.Get()) { speech_synthesizer_->Pause(); SendTtsEvent(utterance_id_, TTS_EVENT_PAUSE, utterance_char_position_); } } void TtsPlatformImplBackgroundWorker::Resume() { if (speech_synthesizer_.Get()) { speech_synthesizer_->Resume(); SendTtsEvent(utterance_id_, TTS_EVENT_RESUME, utterance_char_position_); } } void TtsPlatformImplBackgroundWorker::Shutdown() { if (speech_synthesizer_) speech_synthesizer_->SetNotifySink(nullptr); if (tts_event_sink_) { tts_event_sink_->SetUtteranceId(kInvalidUtteranceId); utterance_id_ = kInvalidUtteranceId; } tts_event_sink_ = nullptr; speech_synthesizer_ = nullptr; } void TtsPlatformImplBackgroundWorker::OnSpeechEvent(int utterance_id) { if (!speech_synthesizer_.Get()) return; SPEVENT event; while (S_OK == speech_synthesizer_->GetEvents(1, &event, nullptr)) { // Ignore notifications that are not related to the current utterance. 
if (event.ulStreamNum != stream_number_ || utterance_id_ == kInvalidUtteranceId || utterance_id != utterance_id_) { continue; } switch (event.eEventId) { case SPEI_START_INPUT_STREAM: utterance_char_position_ = 0; SendTtsEvent(utterance_id_, TTS_EVENT_START, utterance_char_position_); break; case SPEI_END_INPUT_STREAM: GetUIThreadTaskRunner({})->PostTask( FROM_HERE, base::BindOnce(&TtsPlatformImplWin::OnSpeakFinished, base::Unretained(TtsPlatformImplWin::GetInstance()), utterance_id_)); utterance_char_position_ = utterance_length_; SendTtsEvent(utterance_id_, TTS_EVENT_END, utterance_char_position_); break; case SPEI_TTS_BOOKMARK: SendTtsEvent(utterance_id_, TTS_EVENT_MARKER, utterance_char_position_); break; case SPEI_WORD_BOUNDARY: utterance_char_position_ = static_cast(event.lParam) - utterance_prefix_length_; SendTtsEvent(utterance_id_, TTS_EVENT_WORD, utterance_char_position_, static_cast(event.wParam)); break; case SPEI_SENTENCE_BOUNDARY: utterance_char_position_ = static_cast(event.lParam) - utterance_prefix_length_; SendTtsEvent(utterance_id_, TTS_EVENT_SENTENCE, utterance_char_position_); break; default: break; } } } void TtsPlatformImplBackgroundWorker::SendTtsEvent(int utterance_id, TtsEventType event_type, int char_index, int length) { GetUIThreadTaskRunner({})->PostTask( FROM_HERE, base::BindOnce(&TtsController::OnTtsEvent, base::Unretained(TtsController::GetInstance()), utterance_id, event_type, char_index, length, std::string())); } void TtsPlatformImplBackgroundWorker::GetVoices( std::vector* out_voices) { if (!speech_synthesizer_.Get()) return; Microsoft::WRL::ComPtr voice_tokens; unsigned long voice_count; if (S_OK != SpEnumTokens(SPCAT_VOICES, NULL, NULL, &voice_tokens)) return; if (S_OK != voice_tokens->GetCount(&voice_count)) return; for (unsigned i = 0; i < voice_count; i++) { VoiceData voice; Microsoft::WRL::ComPtr voice_token; if (S_OK != voice_tokens->Next(1, &voice_token, NULL)) return; base::win::ScopedCoMem description; if (S_OK != 
SpGetDescription(voice_token.Get(), &description)) continue; voice.name = base::WideToUTF8(description.get()); Microsoft::WRL::ComPtr attributes; if (S_OK != voice_token->OpenKey(kAttributesKey, &attributes)) continue; base::win::ScopedCoMem language; if (S_OK == attributes->GetStringValue(kLanguageValue, &language)) { int lcid_value; base::HexStringToInt(base::WideToUTF8(language.get()), &lcid_value); LCID lcid = MAKELCID(lcid_value, SORT_DEFAULT); WCHAR locale_name[LOCALE_NAME_MAX_LENGTH] = {0}; LCIDToLocaleName(lcid, locale_name, LOCALE_NAME_MAX_LENGTH, 0); voice.lang = base::WideToUTF8(locale_name); } voice.native = true; voice.events.insert(TTS_EVENT_START); voice.events.insert(TTS_EVENT_END); voice.events.insert(TTS_EVENT_MARKER); voice.events.insert(TTS_EVENT_WORD); voice.events.insert(TTS_EVENT_SENTENCE); voice.events.insert(TTS_EVENT_PAUSE); voice.events.insert(TTS_EVENT_RESUME); out_voices->push_back(voice); } } void TtsPlatformImplBackgroundWorker::SetVoiceFromName( const std::string& name) { if (name.empty() || name == last_voice_name_) return; last_voice_name_ = name; Microsoft::WRL::ComPtr voice_tokens; unsigned long voice_count; if (S_OK != SpEnumTokens(SPCAT_VOICES, NULL, NULL, &voice_tokens)) return; if (S_OK != voice_tokens->GetCount(&voice_count)) return; for (unsigned i = 0; i < voice_count; i++) { Microsoft::WRL::ComPtr voice_token; if (S_OK != voice_tokens->Next(1, &voice_token, NULL)) return; base::win::ScopedCoMem description; if (S_OK != SpGetDescription(voice_token.Get(), &description)) continue; if (name == base::WideToUTF8(description.get())) { speech_synthesizer_->SetVoice(voice_token.Get()); break; } } } // // TtsPlatformImplWin // bool TtsPlatformImplWin::PlatformImplInitialized() { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); return platform_initialized_; } void TtsPlatformImplWin::Speak( int utterance_id, const std::string& utterance, const std::string& lang, const VoiceData& voice, const 
UtteranceContinuousParameters& params, base::OnceCallback on_speak_finished) { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); DCHECK(platform_initialized_); // Do not emit utterance if the platform is not ready. if (paused_ || is_speaking_) { std::move(on_speak_finished).Run(false); return; } // Flag that a utterance is getting emitted. The |is_speaking_| flag will be // set back to false when the utterance will be fully spoken, stopped or if // the voice synthetizer was not able to emit it. is_speaking_ = true; utterance_id_ = utterance_id; // Parse SSML and process speech. TtsController::GetInstance()->StripSSML( utterance, base::BindOnce(&TtsPlatformImplWin::ProcessSpeech, base::Unretained(this), utterance_id, lang, voice, params, base::BindOnce(&TtsPlatformImplWin::OnSpeakScheduled, base::Unretained(this), std::move(on_speak_finished)))); } bool TtsPlatformImplWin::StopSpeaking() { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); worker_.Post(FROM_HERE, &TtsPlatformImplBackgroundWorker::StopSpeaking, paused_); paused_ = false; is_speaking_ = false; utterance_id_ = kInvalidUtteranceId; return true; } void TtsPlatformImplWin::Pause() { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); DCHECK(platform_initialized_); if (paused_ || !is_speaking_) return; worker_.Post(FROM_HERE, &TtsPlatformImplBackgroundWorker::Pause); paused_ = true; } void TtsPlatformImplWin::Resume() { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); DCHECK(platform_initialized_); if (!paused_) return; worker_.Post(FROM_HERE, &TtsPlatformImplBackgroundWorker::Resume); paused_ = false; } bool TtsPlatformImplWin::IsSpeaking() { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); DCHECK(platform_initialized_); return is_speaking_ && !paused_; } void TtsPlatformImplWin::GetVoices(std::vector* out_voices) { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); DCHECK(platform_initialized_); 
out_voices->insert(out_voices->end(), voices_.begin(), voices_.end()); } void TtsPlatformImplWin::Shutdown() { // This is required to ensures the object is released before the COM is // uninitialized. Otherwise, this is causing shutdown hangs. worker_.Post(FROM_HERE, &TtsPlatformImplBackgroundWorker::Shutdown); } void TtsPlatformImplWin::OnInitializeComplete(bool success, std::vector voices) { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); if (success) voices_ = std::move(voices); platform_initialized_ = true; TtsController::GetInstance()->VoicesChanged(); } void TtsPlatformImplWin::OnSpeakScheduled( base::OnceCallback on_speak_finished, bool success) { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); DCHECK(is_speaking_); // If the utterance was not able to be emitted, stop the speaking. There // won't be any asynchronous TTS event to confirm the end of the speech. if (!success) FinishCurrentUtterance(); // Pass the results to our caller. std::move(on_speak_finished).Run(success); } void TtsPlatformImplWin::OnSpeakFinished(int utterance_id) { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); if (utterance_id != utterance_id_) return; FinishCurrentUtterance(); } void TtsPlatformImplWin::ProcessSpeech( int utterance_id, const std::string& lang, const VoiceData& voice, const UtteranceContinuousParameters& params, base::OnceCallback on_speak_finished, const std::string& parsed_utterance) { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); worker_.Post(FROM_HERE, &TtsPlatformImplBackgroundWorker::ProcessSpeech, utterance_id, lang, voice, params, std::move(on_speak_finished), parsed_utterance); } TtsPlatformImplWin::TtsPlatformImplWin() : worker_task_runner_( base::ThreadPool::CreateSequencedTaskRunner({base::MayBlock()})), worker_(worker_task_runner_, worker_task_runner_) { DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI)); worker_.Post(FROM_HERE, &TtsPlatformImplBackgroundWorker::Initialize); } 
// static TtsPlatformImplWin* TtsPlatformImplWin::GetInstance() { static base::NoDestructor tts_platform; return tts_platform.get(); } } // namespace // static TtsPlatformImpl* TtsPlatformImpl::GetInstance() { return TtsPlatformImplWin::GetInstance(); } } // namespace content