1 /* ScummVM - Graphic Adventure Engine
2 *
3 * ScummVM is the legal property of its developers, whose names
4 * are too numerous to list here. Please refer to the COPYRIGHT
5 * file distributed with this source distribution.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 */
22
23 // Disable symbol overrides so that we can use system headers.
24 #define FORBIDDEN_SYMBOL_ALLOW_ALL
25 #ifdef HAVE_CONFIG_H
26 #include "config.h"
27 #endif
28 #if defined(USE_TTS) && defined(WIN32)
29 #include <basetyps.h>
30 #include <windows.h>
31 #include <servprov.h>
32
33 #include <sapi.h>
34 #if _SAPI_VER < 0x53
35 #define SPF_PARSE_SAPI 0x80
36 #endif
37
38 #include "backends/platform/sdl/win32/win32_wrapper.h"
39
40 #include "backends/text-to-speech/windows/windows-text-to-speech.h"
41
42
43 #include "common/translation.h"
44 #include "common/system.h"
45 #include "common/ustr.h"
46 #include "common/config-manager.h"
47
48 ISpVoice *_voice;
49
50 // We need this pointer to be able to stop speech immediately.
51 ISpAudio *_audio;
52
WindowsTextToSpeechManager()53 WindowsTextToSpeechManager::WindowsTextToSpeechManager()
54 : _speechState(BROKEN){
55 init();
56 _threadParams.queue = &_speechQueue;
57 _threadParams.state = &_speechState;
58 _threadParams.mutex = &_speechMutex;
59 _thread = NULL;
60 _speechMutex = CreateMutex(NULL, FALSE, NULL);
61 if (_speechMutex == NULL) {
62 _speechState = BROKEN;
63 warning("Could not create TTS mutex");
64 }
65 }
66
init()67 void WindowsTextToSpeechManager::init() {
68 // init COM
69 if (FAILED(::CoInitialize(NULL)))
70 return;
71
72 // init audio
73 ISpObjectTokenCategory *pTokenCategory;
74 HRESULT hr = CoCreateInstance(CLSID_SpObjectTokenCategory, NULL, CLSCTX_ALL, IID_ISpObjectTokenCategory, (void **)&pTokenCategory);
75 if (SUCCEEDED(hr)) {
76 hr = pTokenCategory->SetId(SPCAT_AUDIOOUT, TRUE);
77 if (SUCCEEDED(hr)) {
78 WCHAR *tokenId;
79 hr = pTokenCategory->GetDefaultTokenId(&tokenId);
80 if (SUCCEEDED(hr)) {
81 ISpObjectToken *pToken;
82 hr = CoCreateInstance(CLSID_SpObjectToken, NULL, CLSCTX_ALL, IID_ISpObjectToken, (void **)&pToken);
83 if (SUCCEEDED(hr)) {
84 hr = pToken->SetId(NULL, tokenId, FALSE);
85 if (SUCCEEDED(hr)) {
86 hr = pToken->CreateInstance(NULL, CLSCTX_ALL, IID_ISpAudio, (void **)&_audio);
87 }
88 }
89 CoTaskMemFree(tokenId);
90 }
91 }
92 }
93 if (FAILED(hr)) {
94 warning("Could not initialize TTS audio");
95 return;
96 }
97
98 // init voice
99 hr = CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_ALL, IID_ISpVoice, (void **)&_voice);
100 if (FAILED(hr)) {
101 warning("Could not initialize TTS voice");
102 return;
103 }
104
105 _speechState = NO_VOICE;
106
107 #ifdef USE_TRANSLATION
108 setLanguage(TransMan.getCurrentLanguage());
109 #else
110 setLanguage("en");
111 #endif
112
113 _voice->SetOutput(_audio, FALSE);
114
115 if (!_ttsState->_availableVoices.empty())
116 _speechState = READY;
117 else
118 _speechState = NO_VOICE;
119 _lastSaid = "";
120 while (!_speechQueue.empty()) {
121 free(_speechQueue.front());
122 _speechQueue.pop_front();
123 }
124 }
125
~WindowsTextToSpeechManager()126 WindowsTextToSpeechManager::~WindowsTextToSpeechManager() {
127 stop();
128
129 clearState();
130
131 if (_thread != NULL) {
132 WaitForSingleObject(_thread, INFINITE);
133 CloseHandle(_thread);
134 }
135 if (_speechMutex != NULL) {
136 CloseHandle(_speechMutex);
137 }
138 if (_voice)
139 _voice->Release();
140 ::CoUninitialize();
141 }
142
startSpeech(LPVOID parameters)143 DWORD WINAPI startSpeech(LPVOID parameters) {
144 WindowsTextToSpeechManager::SpeechParameters *params =
145 (WindowsTextToSpeechManager::SpeechParameters *) parameters;
146 // wait for the previous speech, if the previous thread exited too early
147 _voice->WaitUntilDone(INFINITE);
148
149 while (!params->queue->empty()) {
150 WaitForSingleObject(*params->mutex, INFINITE);
151 // check again, when we have exclusive access to the queue
152 if (params->queue->empty() || *(params->state) == WindowsTextToSpeechManager::PAUSED) {
153 ReleaseMutex(*params->mutex);
154 break;
155 }
156 WCHAR *currentSpeech = params->queue->front();
157 _voice->Speak(currentSpeech, SPF_PURGEBEFORESPEAK | SPF_ASYNC | SPF_PARSE_SAPI, 0);
158 ReleaseMutex(*params->mutex);
159
160 while (*(params->state) != WindowsTextToSpeechManager::PAUSED)
161 if (_voice->WaitUntilDone(10) == S_OK)
162 break;
163
164 WaitForSingleObject(*params->mutex, INFINITE);
165 if (!params->queue->empty() && params->queue->front() == currentSpeech) {
166 if (currentSpeech != NULL)
167 free(currentSpeech);
168 params->queue->pop_front();
169 }
170 ReleaseMutex(*params->mutex);
171 }
172
173 WaitForSingleObject(*params->mutex, INFINITE);
174 if (*(params->state) != WindowsTextToSpeechManager::PAUSED)
175 *(params->state) = WindowsTextToSpeechManager::READY;
176 ReleaseMutex(*params->mutex);
177 return 0;
178 }
179
say(const Common::U32String & str,Action action)180 bool WindowsTextToSpeechManager::say(const Common::U32String &str, Action action) {
181 if (_speechState == BROKEN || _speechState == NO_VOICE) {
182 warning("The text to speech cannot speak in this state");
183 return true;
184 }
185
186 if (isSpeaking() && action == DROP)
187 return true;
188
189 // We have to set the pitch by prepending xml code at the start of the said string;
190 Common::U32String pitch = Common::U32String::format("<pitch absmiddle=\"%d\"/>%S", _ttsState->_pitch / 10, str.c_str());
191 WCHAR *strW = (WCHAR *) pitch.encodeUTF16Native();
192 if (strW == nullptr) {
193 warning("Cannot convert from UTF-32 encoding for text to speech");
194 return true;
195 }
196
197 WaitForSingleObject(_speechMutex, INFINITE);
198 if (isSpeaking() && !_speechQueue.empty() && action == INTERRUPT_NO_REPEAT &&
199 _speechQueue.front() != NULL && !wcscmp(_speechQueue.front(), strW)) {
200 while (_speechQueue.size() != 1) {
201 free(_speechQueue.back());
202 _speechQueue.pop_back();
203 }
204 free(strW);
205 ReleaseMutex(_speechMutex);
206 return true;
207 }
208
209 if (isSpeaking() && !_speechQueue.empty() && action == QUEUE_NO_REPEAT &&
210 _speechQueue.front() != NULL &&!wcscmp(_speechQueue.back(), strW)) {
211 ReleaseMutex(_speechMutex);
212 return true;
213 }
214
215 ReleaseMutex(_speechMutex);
216 if ((isPaused() || isSpeaking()) && (action == INTERRUPT || action == INTERRUPT_NO_REPEAT)) {
217 stop();
218 }
219
220 WaitForSingleObject(_speechMutex, INFINITE);
221 _speechQueue.push_back(strW);
222 ReleaseMutex(_speechMutex);
223
224 if (!isSpeaking() && !isPaused()) {
225 DWORD threadId;
226 if (_thread != NULL) {
227 WaitForSingleObject(_thread, INFINITE);
228 CloseHandle(_thread);
229 }
230 _speechState = SPEAKING;
231 _thread = CreateThread(NULL, 0, startSpeech, &_threadParams, 0, &threadId);
232 if (_thread == NULL) {
233 warning("Could not create speech thread");
234 _speechState = READY;
235 return true;
236 }
237 }
238 return false;
239 }
240
stop()241 bool WindowsTextToSpeechManager::stop() {
242 if (_speechState == BROKEN || _speechState == NO_VOICE)
243 return true;
244 if (isPaused())
245 resume();
246 _audio->SetState(SPAS_STOP, 0);
247 WaitForSingleObject(_speechMutex, INFINITE);
248 // Delete the speech queue
249 while (!_speechQueue.empty()) {
250 if (_speechQueue.front() != NULL)
251 free(_speechQueue.front());
252 _speechQueue.pop_front();
253 }
254 // Stop the current speech
255 _voice->Speak(NULL, SPF_PURGEBEFORESPEAK | SPF_ASYNC, 0);
256 _speechState = READY;
257 ReleaseMutex(_speechMutex);
258 _audio->SetState(SPAS_RUN, 0);
259 return false;
260 }
261
pause()262 bool WindowsTextToSpeechManager::pause() {
263 if (_speechState == BROKEN || _speechState == NO_VOICE)
264 return true;
265 if (isPaused())
266 return false;
267 WaitForSingleObject(_speechMutex, INFINITE);
268 _voice->Pause();
269 _speechState = PAUSED;
270 ReleaseMutex(_speechMutex);
271 return false;
272 }
273
resume()274 bool WindowsTextToSpeechManager::resume() {
275 if (_speechState == BROKEN || _speechState == NO_VOICE)
276 return true;
277 if (!isPaused())
278 return false;
279 _voice->Resume();
280 DWORD threadId;
281 if (_thread != NULL) {
282 WaitForSingleObject(_thread, INFINITE);
283 CloseHandle(_thread);
284 }
285 _speechState = SPEAKING;
286 _thread = CreateThread(NULL, 0, startSpeech, &_threadParams, 0, &threadId);
287 if (_thread == NULL) {
288 warning("Could not create speech thread");
289 _speechState = READY;
290 return true;
291 }
292 return false;
293 }
294
isSpeaking()295 bool WindowsTextToSpeechManager::isSpeaking() {
296 return _speechState == SPEAKING;
297 }
298
isPaused()299 bool WindowsTextToSpeechManager::isPaused() {
300 return _speechState == PAUSED;
301 }
302
isReady()303 bool WindowsTextToSpeechManager::isReady() {
304 if (_speechState == BROKEN || _speechState == NO_VOICE)
305 return false;
306 if (_speechState != PAUSED && !isSpeaking())
307 return true;
308 else
309 return false;
310 }
311
setVoice(unsigned index)312 void WindowsTextToSpeechManager::setVoice(unsigned index) {
313 if (_speechState == BROKEN || _speechState == NO_VOICE)
314 return;
315 _voice->SetVoice((ISpObjectToken *) _ttsState->_availableVoices[index].getData());
316 _ttsState->_activeVoice = index;
317 }
318
setRate(int rate)319 void WindowsTextToSpeechManager::setRate(int rate) {
320 if (_speechState == BROKEN || _speechState == NO_VOICE)
321 return;
322 assert(rate >= -100 && rate <= 100);
323 _voice->SetRate(rate / 10);
324 _ttsState->_rate = rate;
325 }
326
setPitch(int pitch)327 void WindowsTextToSpeechManager::setPitch(int pitch) {
328 if (_speechState == BROKEN || _speechState == NO_VOICE)
329 return;
330 assert(pitch >= -100 && pitch <= 100);
331 _ttsState->_pitch = pitch;
332 }
333
setVolume(unsigned volume)334 void WindowsTextToSpeechManager::setVolume(unsigned volume) {
335 if (_speechState == BROKEN || _speechState == NO_VOICE)
336 return;
337 assert(volume <= 100);
338 _voice->SetVolume(volume);
339 _ttsState->_volume = volume;
340 }
341
setLanguage(Common::String language)342 void WindowsTextToSpeechManager::setLanguage(Common::String language) {
343 Common::TextToSpeechManager::setLanguage(language);
344 updateVoices();
345 setVoice(0);
346 }
347
createVoice(void * cpVoiceToken)348 void WindowsTextToSpeechManager::createVoice(void *cpVoiceToken) {
349 ISpObjectToken *voiceToken = (ISpObjectToken *) cpVoiceToken;
350
351 // description
352 WCHAR *descW;
353 char *buffer;
354 Common::String desc;
355 HRESULT hr = voiceToken->GetStringValue(NULL, &descW);
356 if (SUCCEEDED(hr)) {
357 buffer = Win32::unicodeToAnsi(descW);
358 desc = buffer;
359 free(buffer);
360 CoTaskMemFree(descW);
361 }
362
363 if (desc == "Sample TTS Voice") {
364 // This is really bad voice, it is basicaly unusable
365 return;
366 }
367
368 // voice attributes
369 ISpDataKey *key = nullptr;
370 hr = voiceToken->OpenKey(L"Attributes", &key);
371
372 if (FAILED(hr)) {
373 voiceToken->Release();
374 warning("Could not open attribute key for voice: %s", desc.c_str());
375 return;
376 }
377 LPWSTR data;
378
379 // language
380 hr = key->GetStringValue(L"Language", &data);
381 if (FAILED(hr)) {
382 voiceToken->Release();
383 warning("Could not get the language attribute for voice: %s", desc.c_str());
384 return;
385 }
386 Common::String language = lcidToLocale(wcstol(data, NULL, 16));
387 CoTaskMemFree(data);
388
389 // only get the voices for the current language
390 if (language != _ttsState->_language) {
391 voiceToken->Release();
392 return;
393 }
394
395 // gender
396 hr = key->GetStringValue(L"Gender", &data);
397 if (FAILED(hr)) {
398 voiceToken->Release();
399 warning("Could not get the gender attribute for voice: %s", desc.c_str());
400 return;
401 }
402 Common::TTSVoice::Gender gender = !wcscmp(data, L"Male") ? Common::TTSVoice::MALE : Common::TTSVoice::FEMALE;
403 CoTaskMemFree(data);
404
405 // age
406 hr = key->GetStringValue(L"Age", &data);
407 if (FAILED(hr)) {
408 voiceToken->Release();
409 warning("Could not get the age attribute for voice: %s", desc.c_str());
410 return;
411 }
412 Common::TTSVoice::Age age = !wcscmp(data, L"Adult") ? Common::TTSVoice::ADULT : Common::TTSVoice::UNKNOWN_AGE;
413 CoTaskMemFree(data);
414
415 _ttsState->_availableVoices.push_back(Common::TTSVoice(gender, age, (void *) voiceToken, desc));
416 }
417
lcidToLocale(LCID locale)418 Common::String WindowsTextToSpeechManager::lcidToLocale(LCID locale) {
419 int nchars = GetLocaleInfo(locale, LOCALE_SISO639LANGNAME, NULL, 0);
420 TCHAR *languageCode = new TCHAR[nchars];
421 GetLocaleInfo(locale, LOCALE_SISO639LANGNAME, languageCode, nchars);
422 Common::String result = Win32::tcharToString(languageCode);
423 delete[] languageCode;
424 return result;
425 }
426
updateVoices()427 void WindowsTextToSpeechManager::updateVoices() {
428 if (_speechState == BROKEN)
429 return;
430 _ttsState->_availableVoices.clear();
431 ISpObjectToken *cpVoiceToken = nullptr;
432 IEnumSpObjectTokens *cpEnum = nullptr;
433 unsigned long ulCount = 0;
434
435 ISpObjectTokenCategory *cpCategory;
436 HRESULT hr = CoCreateInstance(CLSID_SpObjectTokenCategory, NULL, CLSCTX_ALL, IID_ISpObjectTokenCategory, (void**)&cpCategory);
437 if (SUCCEEDED(hr)) {
438 hr = cpCategory->SetId(L"HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech_OneCore\\Voices", FALSE);
439 if (!SUCCEEDED(hr)) {
440 hr = cpCategory->SetId(SPCAT_VOICES, FALSE);
441 }
442
443 if (SUCCEEDED(hr)) {
444 hr = cpCategory->EnumTokens(NULL, NULL, &cpEnum);
445 }
446 }
447
448 if (SUCCEEDED(hr)) {
449 hr = cpEnum->GetCount(&ulCount);
450 }
451 _voice->SetVolume(0);
452 while (SUCCEEDED(hr) && ulCount--) {
453 hr = cpEnum->Next(1, &cpVoiceToken, NULL);
454 _voice->SetVoice(cpVoiceToken);
455 if (SUCCEEDED(_voice->Speak(L"hi, this is test", SPF_PURGEBEFORESPEAK | SPF_ASYNC | SPF_IS_NOT_XML, 0)))
456 createVoice(cpVoiceToken);
457 else
458 cpVoiceToken->Release();
459 }
460 // stop the test speech, we don't use stop(), because we don't wan't it to set state to READY
461 // and we could easily be in NO_VOICE or BROKEN state here, in which the stop() wouldn't work
462 _audio->SetState(SPAS_STOP, 0);
463 _audio->SetState(SPAS_RUN, 0);
464 _voice->Speak(NULL, SPF_PURGEBEFORESPEAK | SPF_ASYNC | SPF_IS_NOT_XML, 0);
465 _voice->SetVolume(_ttsState->_volume);
466 cpEnum->Release();
467
468 if (_ttsState->_availableVoices.empty()) {
469 _speechState = NO_VOICE;
470 warning("No voice is available for language: %s", _ttsState->_language.c_str());
471 } else if (_speechState == NO_VOICE)
472 _speechState = READY;
473 }
474
freeVoiceData(void * data)475 void WindowsTextToSpeechManager::freeVoiceData(void *data) {
476 ISpObjectToken *voiceToken = (ISpObjectToken *) data;
477 voiceToken->Release();
478 }
479
480 #endif
481