1 /* ScummVM - Graphic Adventure Engine
2  *
3  * ScummVM is the legal property of its developers, whose names
4  * are too numerous to list here. Please refer to the COPYRIGHT
5  * file distributed with this source distribution.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 2
10  * of the License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20  *
21  */
22 
23 // Disable symbol overrides so that we can use system headers.
24 #define FORBIDDEN_SYMBOL_ALLOW_ALL
25 #ifdef HAVE_CONFIG_H
26 #include "config.h"
27 #endif
28 #if defined(USE_TTS) && defined(WIN32)
29 #include <basetyps.h>
30 #include <windows.h>
31 #include <servprov.h>
32 
33 #include <sapi.h>
34 #if _SAPI_VER < 0x53
35 #define SPF_PARSE_SAPI 0x80
36 #endif
37 
38 #include "backends/platform/sdl/win32/win32_wrapper.h"
39 
40 #include "backends/text-to-speech/windows/windows-text-to-speech.h"
41 
42 
43 #include "common/translation.h"
44 #include "common/system.h"
45 #include "common/ustr.h"
46 #include "common/config-manager.h"
47 
48 ISpVoice *_voice;
49 
50 // We need this pointer to be able to stop speech immediately.
51 ISpAudio *_audio;
52 
WindowsTextToSpeechManager()53 WindowsTextToSpeechManager::WindowsTextToSpeechManager()
54 	: _speechState(BROKEN){
55 	init();
56 	_threadParams.queue = &_speechQueue;
57 	_threadParams.state = &_speechState;
58 	_threadParams.mutex = &_speechMutex;
59 	_thread = NULL;
60 	_speechMutex = CreateMutex(NULL, FALSE, NULL);
61 	if (_speechMutex == NULL) {
62 		_speechState = BROKEN;
63 		warning("Could not create TTS mutex");
64 	}
65 }
66 
init()67 void WindowsTextToSpeechManager::init() {
68 	// init COM
69 	if (FAILED(::CoInitialize(NULL)))
70 		return;
71 
72 	// init audio
73 	ISpObjectTokenCategory *pTokenCategory;
74 	HRESULT hr = CoCreateInstance(CLSID_SpObjectTokenCategory, NULL, CLSCTX_ALL, IID_ISpObjectTokenCategory, (void **)&pTokenCategory);
75 	if (SUCCEEDED(hr)) {
76 		hr = pTokenCategory->SetId(SPCAT_AUDIOOUT, TRUE);
77 		if (SUCCEEDED(hr)) {
78 			WCHAR *tokenId;
79 			hr = pTokenCategory->GetDefaultTokenId(&tokenId);
80 			if (SUCCEEDED(hr)) {
81 				ISpObjectToken *pToken;
82 				hr = CoCreateInstance(CLSID_SpObjectToken, NULL, CLSCTX_ALL, IID_ISpObjectToken, (void **)&pToken);
83 				if (SUCCEEDED(hr)) {
84 					hr = pToken->SetId(NULL, tokenId, FALSE);
85 					if (SUCCEEDED(hr)) {
86 						hr = pToken->CreateInstance(NULL, CLSCTX_ALL, IID_ISpAudio, (void **)&_audio);
87 					}
88 				}
89 				CoTaskMemFree(tokenId);
90 			}
91 		}
92 	}
93 	if (FAILED(hr)) {
94 		warning("Could not initialize TTS audio");
95 		return;
96 	}
97 
98 	// init voice
99 	hr = CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_ALL, IID_ISpVoice, (void **)&_voice);
100 	if (FAILED(hr)) {
101 		warning("Could not initialize TTS voice");
102 		return;
103 	}
104 
105 	_speechState = NO_VOICE;
106 
107 #ifdef USE_TRANSLATION
108 	setLanguage(TransMan.getCurrentLanguage());
109 #else
110 	setLanguage("en");
111 #endif
112 
113 	_voice->SetOutput(_audio, FALSE);
114 
115 	if (!_ttsState->_availableVoices.empty())
116 		_speechState = READY;
117 	else
118 		_speechState = NO_VOICE;
119 	_lastSaid = "";
120 	while (!_speechQueue.empty()) {
121 		free(_speechQueue.front());
122 		_speechQueue.pop_front();
123 	}
124 }
125 
~WindowsTextToSpeechManager()126 WindowsTextToSpeechManager::~WindowsTextToSpeechManager() {
127 	stop();
128 
129 	clearState();
130 
131 	if (_thread != NULL) {
132 		WaitForSingleObject(_thread, INFINITE);
133 		CloseHandle(_thread);
134 	}
135 	if (_speechMutex != NULL) {
136 		CloseHandle(_speechMutex);
137 	}
138 	if (_voice)
139 		_voice->Release();
140 	::CoUninitialize();
141 }
142 
startSpeech(LPVOID parameters)143 DWORD WINAPI startSpeech(LPVOID parameters) {
144 	WindowsTextToSpeechManager::SpeechParameters *params =
145 		(WindowsTextToSpeechManager::SpeechParameters *) parameters;
146 	// wait for the previous speech, if the previous thread exited too early
147 	_voice->WaitUntilDone(INFINITE);
148 
149 	while (!params->queue->empty()) {
150 		WaitForSingleObject(*params->mutex, INFINITE);
151 		// check again, when we have exclusive access to the queue
152 		if (params->queue->empty() || *(params->state) == WindowsTextToSpeechManager::PAUSED) {
153 			ReleaseMutex(*params->mutex);
154 			break;
155 		}
156 		WCHAR *currentSpeech = params->queue->front();
157 		_voice->Speak(currentSpeech, SPF_PURGEBEFORESPEAK | SPF_ASYNC | SPF_PARSE_SAPI, 0);
158 		ReleaseMutex(*params->mutex);
159 
160 		while (*(params->state) != WindowsTextToSpeechManager::PAUSED)
161 			if (_voice->WaitUntilDone(10) == S_OK)
162 				break;
163 
164 		WaitForSingleObject(*params->mutex, INFINITE);
165 		if (!params->queue->empty() && params->queue->front() == currentSpeech) {
166 			if (currentSpeech != NULL)
167 				free(currentSpeech);
168 			params->queue->pop_front();
169 		}
170 		ReleaseMutex(*params->mutex);
171 	}
172 
173 	WaitForSingleObject(*params->mutex, INFINITE);
174 	if (*(params->state) != WindowsTextToSpeechManager::PAUSED)
175 		*(params->state) = WindowsTextToSpeechManager::READY;
176 	ReleaseMutex(*params->mutex);
177 	return 0;
178 }
179 
say(const Common::U32String & str,Action action)180 bool WindowsTextToSpeechManager::say(const Common::U32String &str, Action action) {
181 	if (_speechState == BROKEN || _speechState == NO_VOICE) {
182 		warning("The text to speech cannot speak in this state");
183 		return true;
184 	}
185 
186 	if (isSpeaking() && action == DROP)
187 		return true;
188 
189 	// We have to set the pitch by prepending xml code at the start of the said string;
190 	Common::U32String pitch = Common::U32String::format("<pitch absmiddle=\"%d\"/>%S", _ttsState->_pitch / 10, str.c_str());
191 	WCHAR *strW = (WCHAR *) pitch.encodeUTF16Native();
192 	if (strW == nullptr) {
193 		warning("Cannot convert from UTF-32 encoding for text to speech");
194 		return true;
195 	}
196 
197 	WaitForSingleObject(_speechMutex, INFINITE);
198 	if (isSpeaking() && !_speechQueue.empty() && action == INTERRUPT_NO_REPEAT &&
199 			_speechQueue.front() != NULL && !wcscmp(_speechQueue.front(), strW)) {
200 		while (_speechQueue.size() != 1) {
201 			free(_speechQueue.back());
202 			_speechQueue.pop_back();
203 		}
204 		free(strW);
205 		ReleaseMutex(_speechMutex);
206 		return true;
207 	}
208 
209 	if (isSpeaking() && !_speechQueue.empty() && action == QUEUE_NO_REPEAT &&
210 			_speechQueue.front() != NULL &&!wcscmp(_speechQueue.back(), strW)) {
211 		ReleaseMutex(_speechMutex);
212 		return true;
213 	}
214 
215 	ReleaseMutex(_speechMutex);
216 	if ((isPaused() || isSpeaking()) && (action == INTERRUPT || action == INTERRUPT_NO_REPEAT)) {
217 		stop();
218 	}
219 
220 	WaitForSingleObject(_speechMutex, INFINITE);
221 	_speechQueue.push_back(strW);
222 	ReleaseMutex(_speechMutex);
223 
224 	if (!isSpeaking() && !isPaused()) {
225 		DWORD threadId;
226 		if (_thread != NULL) {
227 			WaitForSingleObject(_thread, INFINITE);
228 			CloseHandle(_thread);
229 		}
230 		_speechState = SPEAKING;
231 		_thread = CreateThread(NULL, 0, startSpeech, &_threadParams, 0, &threadId);
232 		if (_thread == NULL) {
233 			warning("Could not create speech thread");
234 			_speechState = READY;
235 			return true;
236 		}
237 	}
238 	return false;
239 }
240 
stop()241 bool WindowsTextToSpeechManager::stop() {
242 	if (_speechState == BROKEN || _speechState == NO_VOICE)
243 		return true;
244 	if (isPaused())
245 		resume();
246 	_audio->SetState(SPAS_STOP, 0);
247 	WaitForSingleObject(_speechMutex, INFINITE);
248 	// Delete the speech queue
249 	while (!_speechQueue.empty()) {
250 		if (_speechQueue.front() != NULL)
251 			free(_speechQueue.front());
252 		_speechQueue.pop_front();
253 	}
254 	// Stop the current speech
255 	_voice->Speak(NULL, SPF_PURGEBEFORESPEAK | SPF_ASYNC, 0);
256 	_speechState = READY;
257 	ReleaseMutex(_speechMutex);
258 	_audio->SetState(SPAS_RUN, 0);
259 	return false;
260 }
261 
pause()262 bool WindowsTextToSpeechManager::pause() {
263 	if (_speechState == BROKEN || _speechState == NO_VOICE)
264 		return true;
265 	if (isPaused())
266 		return false;
267 	WaitForSingleObject(_speechMutex, INFINITE);
268 	_voice->Pause();
269 	_speechState = PAUSED;
270 	ReleaseMutex(_speechMutex);
271 	return false;
272 }
273 
resume()274 bool WindowsTextToSpeechManager::resume() {
275 	if (_speechState == BROKEN || _speechState == NO_VOICE)
276 		return true;
277 	if (!isPaused())
278 		return false;
279 	_voice->Resume();
280 	DWORD threadId;
281 	if (_thread != NULL) {
282 		WaitForSingleObject(_thread, INFINITE);
283 		CloseHandle(_thread);
284 	}
285 	_speechState = SPEAKING;
286 	_thread = CreateThread(NULL, 0, startSpeech, &_threadParams, 0, &threadId);
287 	if (_thread == NULL) {
288 		warning("Could not create speech thread");
289 		_speechState = READY;
290 		return true;
291 	}
292 	return false;
293 }
294 
isSpeaking()295 bool WindowsTextToSpeechManager::isSpeaking() {
296 	return _speechState == SPEAKING;
297 }
298 
isPaused()299 bool WindowsTextToSpeechManager::isPaused() {
300 	return _speechState == PAUSED;
301 }
302 
isReady()303 bool WindowsTextToSpeechManager::isReady() {
304 	if (_speechState == BROKEN || _speechState == NO_VOICE)
305 		return false;
306 	if (_speechState != PAUSED && !isSpeaking())
307 		return true;
308 	else
309 		return false;
310 }
311 
setVoice(unsigned index)312 void WindowsTextToSpeechManager::setVoice(unsigned index) {
313 	if (_speechState == BROKEN || _speechState == NO_VOICE)
314 		return;
315 	_voice->SetVoice((ISpObjectToken *) _ttsState->_availableVoices[index].getData());
316 	_ttsState->_activeVoice = index;
317 }
318 
setRate(int rate)319 void WindowsTextToSpeechManager::setRate(int rate) {
320 	if (_speechState == BROKEN || _speechState == NO_VOICE)
321 		return;
322 	assert(rate >= -100 && rate <= 100);
323 	_voice->SetRate(rate / 10);
324 	_ttsState->_rate = rate;
325 }
326 
setPitch(int pitch)327 void WindowsTextToSpeechManager::setPitch(int pitch) {
328 	if (_speechState == BROKEN || _speechState == NO_VOICE)
329 		return;
330 	assert(pitch >= -100 && pitch <= 100);
331 	_ttsState->_pitch = pitch;
332 }
333 
setVolume(unsigned volume)334 void WindowsTextToSpeechManager::setVolume(unsigned volume) {
335 	if (_speechState == BROKEN || _speechState == NO_VOICE)
336 		return;
337 	assert(volume <= 100);
338 	_voice->SetVolume(volume);
339 	_ttsState->_volume = volume;
340 }
341 
setLanguage(Common::String language)342 void WindowsTextToSpeechManager::setLanguage(Common::String language) {
343 	Common::TextToSpeechManager::setLanguage(language);
344 	updateVoices();
345 	setVoice(0);
346 }
347 
createVoice(void * cpVoiceToken)348 void WindowsTextToSpeechManager::createVoice(void *cpVoiceToken) {
349 	ISpObjectToken *voiceToken = (ISpObjectToken *) cpVoiceToken;
350 
351 	// description
352 	WCHAR *descW;
353 	char *buffer;
354 	Common::String desc;
355 	HRESULT hr = voiceToken->GetStringValue(NULL, &descW);
356 	if (SUCCEEDED(hr)) {
357 		buffer = Win32::unicodeToAnsi(descW);
358 		desc = buffer;
359 		free(buffer);
360 		CoTaskMemFree(descW);
361 	}
362 
363 	if (desc == "Sample TTS Voice") {
364 		// This is really bad voice, it is basicaly unusable
365 		return;
366 	}
367 
368 	// voice attributes
369 	ISpDataKey *key = nullptr;
370 	hr = voiceToken->OpenKey(L"Attributes", &key);
371 
372 	if (FAILED(hr)) {
373 		voiceToken->Release();
374 		warning("Could not open attribute key for voice: %s", desc.c_str());
375 		return;
376 	}
377 	LPWSTR data;
378 
379 	// language
380 	hr = key->GetStringValue(L"Language", &data);
381 	if (FAILED(hr)) {
382 		voiceToken->Release();
383 		warning("Could not get the language attribute for voice: %s", desc.c_str());
384 		return;
385 	}
386 	Common::String language = lcidToLocale(wcstol(data, NULL, 16));
387 	CoTaskMemFree(data);
388 
389 	// only get the voices for the current language
390 	if (language != _ttsState->_language) {
391 		voiceToken->Release();
392 		return;
393 	}
394 
395 	// gender
396 	hr = key->GetStringValue(L"Gender", &data);
397 	if (FAILED(hr)) {
398 		voiceToken->Release();
399 		warning("Could not get the gender attribute for voice: %s", desc.c_str());
400 		return;
401 	}
402 	Common::TTSVoice::Gender gender = !wcscmp(data, L"Male") ? Common::TTSVoice::MALE : Common::TTSVoice::FEMALE;
403 	CoTaskMemFree(data);
404 
405 	// age
406 	hr = key->GetStringValue(L"Age", &data);
407 	if (FAILED(hr)) {
408 		voiceToken->Release();
409 		warning("Could not get the age attribute for voice: %s", desc.c_str());
410 		return;
411 	}
412 	Common::TTSVoice::Age age = !wcscmp(data, L"Adult") ? Common::TTSVoice::ADULT : Common::TTSVoice::UNKNOWN_AGE;
413 	CoTaskMemFree(data);
414 
415 	_ttsState->_availableVoices.push_back(Common::TTSVoice(gender, age, (void *) voiceToken, desc));
416 }
417 
lcidToLocale(LCID locale)418 Common::String WindowsTextToSpeechManager::lcidToLocale(LCID locale) {
419 	int nchars = GetLocaleInfo(locale, LOCALE_SISO639LANGNAME, NULL, 0);
420 	TCHAR *languageCode = new TCHAR[nchars];
421 	GetLocaleInfo(locale, LOCALE_SISO639LANGNAME, languageCode, nchars);
422 	Common::String result = Win32::tcharToString(languageCode);
423 	delete[] languageCode;
424 	return result;
425 }
426 
updateVoices()427 void WindowsTextToSpeechManager::updateVoices() {
428 	if (_speechState == BROKEN)
429 		return;
430 	_ttsState->_availableVoices.clear();
431 	ISpObjectToken *cpVoiceToken = nullptr;
432 	IEnumSpObjectTokens *cpEnum = nullptr;
433 	unsigned long ulCount = 0;
434 
435 	ISpObjectTokenCategory *cpCategory;
436 	HRESULT hr = CoCreateInstance(CLSID_SpObjectTokenCategory, NULL, CLSCTX_ALL, IID_ISpObjectTokenCategory, (void**)&cpCategory);
437 	if (SUCCEEDED(hr)) {
438 		hr = cpCategory->SetId(L"HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech_OneCore\\Voices", FALSE);
439 		if (!SUCCEEDED(hr)) {
440 			hr = cpCategory->SetId(SPCAT_VOICES, FALSE);
441 		}
442 
443 		if (SUCCEEDED(hr)) {
444 			hr = cpCategory->EnumTokens(NULL, NULL, &cpEnum);
445 		}
446 	}
447 
448 	if (SUCCEEDED(hr)) {
449 		hr = cpEnum->GetCount(&ulCount);
450 	}
451 	_voice->SetVolume(0);
452 	while (SUCCEEDED(hr) && ulCount--) {
453 		hr = cpEnum->Next(1, &cpVoiceToken, NULL);
454 		_voice->SetVoice(cpVoiceToken);
455 		if (SUCCEEDED(_voice->Speak(L"hi, this is test", SPF_PURGEBEFORESPEAK | SPF_ASYNC | SPF_IS_NOT_XML, 0)))
456 			createVoice(cpVoiceToken);
457 		else
458 			cpVoiceToken->Release();
459 	}
460 	// stop the test speech, we don't use stop(), because we don't wan't it to set state to READY
461 	// and we could easily be in NO_VOICE or BROKEN state here, in which the stop() wouldn't work
462 	_audio->SetState(SPAS_STOP, 0);
463 	_audio->SetState(SPAS_RUN, 0);
464 	_voice->Speak(NULL, SPF_PURGEBEFORESPEAK | SPF_ASYNC | SPF_IS_NOT_XML, 0);
465 	_voice->SetVolume(_ttsState->_volume);
466 	cpEnum->Release();
467 
468 	if (_ttsState->_availableVoices.empty()) {
469 		_speechState = NO_VOICE;
470 		warning("No voice is available for language: %s", _ttsState->_language.c_str());
471 	} else if (_speechState == NO_VOICE)
472 		_speechState = READY;
473 }
474 
freeVoiceData(void * data)475 void WindowsTextToSpeechManager::freeVoiceData(void *data) {
476 	ISpObjectToken *voiceToken = (ISpObjectToken *) data;
477 	voiceToken->Release();
478 }
479 
480 #endif
481