1 /*************************************************************************/
2 /* */
3 /* Language Technologies Institute */
4 /* Carnegie Mellon University */
5 /* Copyright (c) 2009 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author: Alan W Black (awb@cs.cmu.edu) */
34 /* Date: January 2009 */
35 /*************************************************************************/
36 /* */
37 /* flowm functions for flite access */
38 /* */
39 /*************************************************************************/
40
41 #include <windows.h>
42 #include <commctrl.h>
43 #include <aygshell.h>
44
45 #include "cst_wchar.h"
46 #include "flite.h"
47 #include "flowm.h"
48
/* For debugging it's sometimes good to switch off the actual synthesis.  */
/* The code in this file tests the macro with #ifdef, so synthesis is     */
/* switched off by removing this definition, not by changing its value.   */
#define DOTTS 1
51
/* Audio device used for streaming playback.  It is opened on demand by   */
/* flowm_audio_stream_chunk() and closed again (and reset to NULL) by     */
/* flowm_say_text() / flowm_say_file() once synthesis has finished.       */
static cst_audiodev *fl_ad = 0;
53
#ifdef DOTTS
/* Register/unregister entry points for each voice that may be listed in  */
/* the VoxDefs table below.  They are defined outside this file.          */
cst_voice *register_cmu_us_kal(const char *voxdir);
void unregister_cmu_us_kal(cst_voice *v);
cst_voice *register_cmu_us_awb(const char *voxdir);
void unregister_cmu_us_awb(cst_voice *v);
cst_voice *register_cmu_us_rms(const char *voxdir);
void unregister_cmu_us_rms(cst_voice *v);
cst_voice *register_cmu_us_slt(const char *voxdir);
void unregister_cmu_us_slt(cst_voice *v);
#endif
64
/* The waveform produced by the most recent flowm_say_text() call, kept   */
/* so that flowm_save_wave() can write it out.  It is deleted at the      */
/* start of the next flowm_say_text()/flowm_say_file() call and in        */
/* flowm_terminate().                                                     */
cst_wave *previous_wave = NULL;
66
/* Description of one voice known to flowm: its name, how to register     */
/* and unregister it, and the registered voice itself.                    */
typedef struct VoxDef_struct
{
    TCHAR *name;                          /* voice name; NULL marks the end of VoxDefs */
    cst_voice *(*rv)(const char *voxdir); /* register_voice */
    void (*urv)(cst_voice *v);            /* unregister_voice */
    int min_buffsize;                     /* for audio streaming */
    cst_voice *v;                         /* registered voice, set by flowm_init() */
} VoxDef;
75
/* Table of available voices.  A voice is listed only when the matching   */
/* cmu_us_* macro is defined at compile time.  The table is terminated by */
/* an entry whose name is NULL; the loops in this file stop on that entry */
VoxDef VoxDefs[] = {
#ifdef cmu_us_kal
    { L"kal", register_cmu_us_kal, unregister_cmu_us_kal, 256, NULL },
#endif
#ifdef cmu_us_awb
    { L"awb", register_cmu_us_awb, unregister_cmu_us_awb, 2000, NULL },
#endif
#ifdef cmu_us_rms
    { L"rms", register_cmu_us_rms, unregister_cmu_us_rms, 2000, NULL },
#endif
#ifdef cmu_us_slt
    { L"slt", register_cmu_us_slt, unregister_cmu_us_slt, 2000, NULL },
#endif
    { NULL, NULL }   /* end-of-table marker; remaining fields are zero */
};
91
/* Callbacks defined later in this file; declared here because            */
/* flowm_init() installs them in each voice's features before their       */
/* definitions appear.                                                    */
cst_utterance *flowm_print_relation_callback(cst_utterance *u);
cst_utterance *flowm_utt_callback(cst_utterance *u);
int flowm_audio_stream_chunk(const cst_wave *w, int start, int size,
                             int last, cst_audio_streaming_info *asi);
96
flowm_find_file_percentage()97 float flowm_find_file_percentage()
98 {
99 if (flowm_file_size <= 0)
100 return 0.0;
101 else
102 return (flowm_file_pos*100.0)/flowm_file_size;
103 }
104
flowm_voice_name(int i)105 TCHAR *flowm_voice_name(int i)
106 {
107 /* In order not to have flite things in flowm_main, we provide an */
108 /* interface to the voice list */
109 return VoxDefs[i].name;
110 }
111
/* Initialize flite and register every voice listed in VoxDefs.  Each     */
/* registered voice has three features installed: the audio streaming     */
/* callback, the per-utterance user callback and the post-synthesis       */
/* relation-printing hook.  Does nothing when DOTTS is not defined.       */
void flowm_init()
{
#ifdef DOTTS
    VoxDef *vd;
    cst_audio_streaming_info *stream_info;

    flite_init();     /* Initialize flite interface */

    for (vd = VoxDefs; vd->name; vd++)
    {
        vd->v = (vd->rv)(NULL);     /* register voice */

        /* Low level audio streaming callback: the waveform is played   */
        /* while it is being synthesized.  This is necessary for the    */
        /* slower (CG) voices                                           */
        stream_info = new_audio_streaming_info();
        stream_info->asc = flowm_audio_stream_chunk;
        stream_info->min_buffsize = vd->min_buffsize;
        feat_set(vd->v->features,
                 "streaming_info",
                 audio_streaming_info_val(stream_info));

        /* Callback that reports which tokens are being synthesized and */
        /* keeps track of the current position in the file              */
        feat_set(vd->v->features,
                 "utt_user_callback",
                 uttfunc_val(flowm_utt_callback));

        /* For outputting results of a relation (only used in play) */
        feat_set(vd->v->features,
                 "post_synth_hook_func",
                 uttfunc_val(flowm_print_relation_callback));
    }
#endif
}
150
flowm_terminate()151 void flowm_terminate()
152 {
153 #ifdef DOTTS
154 int i;
155
156 for (i=0; VoxDefs[i].name; i++)
157 {
158 (VoxDefs[i].urv)(VoxDefs[i].v); /* unregister voice */
159 }
160 #endif
161 if (previous_wave)
162 {
163 delete_wave(previous_wave);
164 previous_wave = NULL;
165 }
166
167 return;
168 }
169
/* Save the last synthesized waveform (previous_wave) to filename as a    */
/* RIFF file.  Returns -1 when there is no waveform to save, otherwise    */
/* whatever cst_wave_save_riff() returns.                                 */
int flowm_save_wave(TCHAR *filename)
{
    char *narrow_name;
    int status = -1;

    if (previous_wave)
    {
        /* flite wants a char string, so convert the wide filename */
        narrow_name = cst_wstr2cstr(filename);
        status = cst_wave_save_riff(previous_wave,narrow_name);
        cst_free(narrow_name);
    }

    return status;
}
185
186 #ifdef DOTTS
flowm_say_text(TCHAR * text)187 int flowm_say_text(TCHAR *text)
188 {
189 char *s;
190 int ns;
191 cst_voice *v;
192
193 if (previous_wave)
194 {
195 delete_wave(previous_wave);
196 previous_wave = NULL;
197 }
198
199 s = cst_wstr2cstr(text); /* text to synthesize */
200 v = VoxDefs[flowm_selected_voice].v; /* voice to synthesize with */
201
202 feat_remove(v->features,"print_info_relation");
203 if (flowm_selected_relation == 1)
204 feat_set_string(v->features, "print_info_relation", "Word");
205 if (flowm_selected_relation == 2)
206 feat_set_string(v->features, "print_info_relation", "Segment");
207
208 /* Do the synthesis */
209 previous_wave = flite_text_to_wave(s,v);
210
211 ns = cst_wave_num_samples(previous_wave);
212
213 cst_free(s);
214 audio_flush(fl_ad);
215 audio_close(fl_ad);
216 fl_ad = NULL;
217
218 return ns;
219 }
220 #else
flowm_say_text(TCHAR * text)221 int flowm_say_text(TCHAR *text)
222 {
223 MessageBoxW(0,text,L"SayText",0);
224 return 0;
225 }
226 #endif
227
/* Build a displayable summary of a named relation and leave it in        */
/* fl_tts_msg.  The relation name is read from the utterance feature      */
/* "print_info_relation"; when that feature is absent fl_tts_msg is set   */
/* to the empty string.  Otherwise the message is "<relname>: " followed  */
/* by the "name" feature of each item, separated by spaces.  If the items */
/* do not all fit in FL_MAX_MSG_CHARS the list is cut short and " ..."    */
/* is appended.  Always returns u.                                        */
cst_utterance *flowm_print_relation_callback(cst_utterance *u)
{
    char rst[FL_MAX_MSG_CHARS];
    const char *name;
    const char *relname;
    const char *space;
    cst_item *item;
    int len;          /* number of characters currently in rst */
    int name_len;
    int space_len;

    /* Check for a missing relation name before it is used for anything */
    relname = get_param_string(u->features,"print_info_relation", NULL);
    if (!relname)
    {
        mbstowcs(fl_tts_msg,"",FL_MAX_MSG_CHARS);
        return u;
    }

    /* Heading: only written when it fits, leaving room for " ..." */
    rst[0] = '\0';
    len = 0;
    if ((int)cst_strlen(relname)+2+4 < FL_MAX_MSG_CHARS)
    {
        cst_sprintf(rst,"%s: ",relname);
        len = (int)cst_strlen(rst);
    }

    space = "";
    for (item=relation_head(utt_relation(u,relname));
         item; item=item_next(item))
    {
        name = item_feat_string(item,"name");
        name_len = (int)cst_strlen(name);
        space_len = (int)cst_strlen(space);

        /* The test includes what is already in rst, and always keeps */
        /* 4 characters spare for a later " ..."                      */
        if (len+space_len+name_len+4 < FL_MAX_MSG_CHARS)
        {
            /* Append at the end of rst; rst itself is never passed  */
            /* as an argument to the call that writes into it        */
            cst_sprintf(rst+len,"%s%s",space,name);
            len += space_len+name_len;
        }
        else
        {
            if (len+4 < FL_MAX_MSG_CHARS)
                cst_sprintf(rst+len," ...");
            break;
        }
        space = " ";
    }
    mbstowcs(fl_tts_msg,rst,FL_MAX_MSG_CHARS);

    return u;
}
264
/* Per-utterance callback installed as "utt_user_callback".               */
/*                                                                        */
/* When the play status is neither FLOWM_PLAY nor FLOWM_SKIP (e.g. the    */
/* STOP button was pressed) the utterance is deleted and NULL returned,   */
/* which stops the synthesis of this utterance.                           */
/*                                                                        */
/* Otherwise, if there is a TTSWindow, it                                 */
/*   - builds the text of the utterance from its Token relation,          */
/*   - sets flowm_file_pos from the first token's "file_pos",             */
/*   - records that position in the flowm_prev_utt_pos history,           */
/*   - shows the text and the file position percentage in the window,     */
/*   - turns a FLOWM_SKIP status back into FLOWM_PLAY,                    */
/* and returns u.  With no TTSWindow it returns u unchanged.              */
cst_utterance *flowm_utt_callback(cst_utterance *u)
{
    char rst[FL_MAX_MSG_CHARS];
    const char *tok;
    const char *prepunc;
    const char *punc;
    const char *space;
    cst_item *item;
    int len;             /* number of characters currently in rst */
    int extend_length;

    /* In order to stop the synthesizer if the STOP button is pressed */
    /* This stops the synthesis of the next utterance                 */
    if ((flowm_play_status != FLOWM_PLAY) &&
        (flowm_play_status != FLOWM_SKIP))
    {
        delete_utterance(u);
        return NULL;
    }

    if (!TTSWindow)
        return u;

    rst[0] = '\0';
    len = 0;
    space = "";
    for (item=relation_head(utt_relation(u,"Token"));
         item; item=item_next(item))
    {
        tok = item_feat_string(item,"name");
        if (cst_streq("",space))
            /* Only do this on the first token/word */
            flowm_file_pos = item_feat_int(item,"file_pos");

        prepunc = item_feat_string(item,"prepunctuation");
        punc = item_feat_string(item,"punc");

        /* Room needed: what is there already, a separator, the token */
        /* with its punctuation, and 4 spare for a later " ..."       */
        extend_length = len + 1 +
            (int)cst_strlen(prepunc) + (int)cst_strlen(punc);
        if ((int)cst_strlen(tok)+extend_length+4 < FL_MAX_MSG_CHARS)
        {
            /* Append at the end of rst; rst itself is never passed  */
            /* as an argument to the call that writes into it        */
            cst_sprintf(rst+len,"%s%s%s%s",space,prepunc,tok,punc);
            len += (int)cst_strlen(rst+len);
        }
        else
        {
            if (len+4 < FL_MAX_MSG_CHARS)
                cst_sprintf(rst+len," ...");
            break;
        }
        space = " ";
    }

    if (flowm_file_pos > flowm_prev_utt_pos[flowm_utt_pos_pos])
    {
        if ((flowm_utt_pos_pos+1) >= FLOWM_NUM_UTT_POS)
        {
            /* Filled it up, so move it down: drop the oldest entry */
            /* and keep every more recent position, in order.  The  */
            /* new position then goes in the last slot              */
            memmove(flowm_prev_utt_pos,&flowm_prev_utt_pos[1],
                    sizeof(flowm_prev_utt_pos[0])*(FLOWM_NUM_UTT_POS-1));
            flowm_utt_pos_pos = (FLOWM_NUM_UTT_POS-2);
        }
        flowm_utt_pos_pos++;
        flowm_prev_utt_pos[flowm_utt_pos_pos] = flowm_file_pos;
    }

    /* Send text to TTSWindow */
    mbstowcs(fl_tts_msg,rst,FL_MAX_MSG_CHARS);
    SetDlgItemText(TTSWindow, FL_SYNTHTEXT, fl_tts_msg);

    /* Update file pos percentage in FilePos window */
    cst_sprintf(rst,"%2.3f",flowm_find_file_percentage());
    mbstowcs(fl_fp_msg,rst,FL_MAX_MSG_CHARS);
    SetDlgItemText(TTSWindow, FL_FILEPOS, fl_fp_msg);

    SystemIdleTimerReset();  /* keep alive while synthesizing */
    if (flowm_play_status == FLOWM_SKIP)
        flowm_play_status = FLOWM_PLAY;

    return u;
}
341
/* Audio streaming callback installed as each voice's "streaming_info".   */
/* It is handed size samples of w beginning at index start.               */
/*                                                                        */
/* The audio device fl_ad is opened on the first call, using the wave's   */
/* sample rate and channel count.  What happens next depends on           */
/* flowm_play_status:                                                     */
/*   FLOWM_PLAY   the samples are written to the audio device             */
/*   FLOWM_BENCH  nothing is played; the chunk's duration is added to     */
/*                flowm_duration                                          */
/*   otherwise    CST_AUDIO_STREAM_STOP is returned                       */
/* The first two cases return CST_AUDIO_STREAM_CONT.  The last and asi    */
/* arguments are not used here.                                           */
int flowm_audio_stream_chunk(const cst_wave *w, int start, int size,
                             int last, cst_audio_streaming_info *asi)
{
    if (fl_ad == NULL)
    {
        fl_ad = audio_open(w->sample_rate,w->num_channels,CST_AUDIO_LINEAR16);
    }

    if (flowm_play_status == FLOWM_PLAY)
    {
        /* If the device is still not open there is nothing to write */
        /* to, so stop streaming instead of passing NULL on          */
        if (fl_ad == NULL)
            return CST_AUDIO_STREAM_STOP;

        audio_write(fl_ad,&w->samples[start],size*sizeof(short));
        return CST_AUDIO_STREAM_CONT;
    }
    else if (flowm_play_status == FLOWM_BENCH)
    {   /* Do TTS but don't actually play it */
        /* How much have we played */
        flowm_duration += (size*1.0)/w->sample_rate;
        return CST_AUDIO_STREAM_CONT;
    }
    else
    {   /* for STOP, and the SKIPS (if they get here) */
        return CST_AUDIO_STREAM_STOP;
    }
}
367
368 #ifdef DOTTS
flowm_say_file(TCHAR * tfilename)369 int flowm_say_file(TCHAR *tfilename)
370 {
371 int rc = 0;
372 char *filename;
373 cst_voice *v;
374
375 if (previous_wave)
376 { /* This is really tidy up from Play -- but might say space */
377 delete_wave(previous_wave);
378 previous_wave = NULL;
379 }
380
381 if (fl_ad)
382 {
383 MessageBoxW(0,L"audio fd still open",L"SayFile",0);
384 audio_close(fl_ad);
385 fl_ad = NULL;
386 }
387
388 v = VoxDefs[flowm_selected_voice].v;
389
390 /* Where we want to start from */
391 feat_set_int(v->features, "file_start_position", flowm_file_pos);
392
393 /* Only do print_info in play mode */
394 feat_remove(v->features,"print_info_relation");
395
396 filename = cst_wstr2cstr(tfilename);
397 rc = flite_file_to_speech(filename, v, "stream");
398 cst_free(filename);
399
400 audio_flush(fl_ad);
401 audio_close(fl_ad);
402 fl_ad = NULL;
403
404 return rc;
405
406 }
407 #else
flowm_say_file(TCHAR * text)408 int flowm_say_file(TCHAR *text)
409 {
410 MessageBoxW(0,text,L"SayFile",0);
411 return 0;
412 }
413 #endif
414
415
416
417
418
419