1 /* SpeechSynthesizer.cpp
2 *
3 // * Copyright (C) 2011-2019 David Weenink
4 *
5 * This code is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or (at
8 * your option) any later version.
9 *
10 * This code is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this work. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #include "espeak_ng_version.h"
20 #include "espeak_ng.h"
21 #include "espeakdata_FileInMemory.h"
22
23 #include "SpeechSynthesizer.h"
24 #include "Strings_extensions.h"
25 #include "speak_lib.h"
26 #include "encoding.h"
27 #include "string.h"
28 #include "translate.h"
29
30 #include "oo_DESTROY.h"
31 #include "SpeechSynthesizer_def.h"
32 #include "oo_COPY.h"
33 #include "SpeechSynthesizer_def.h"
34 #include "oo_EQUAL.h"
35 #include "SpeechSynthesizer_def.h"
36 #include "oo_CAN_WRITE_AS_ENCODING.h"
37 #include "SpeechSynthesizer_def.h"
38 #include "oo_WRITE_TEXT.h"
39 #include "SpeechSynthesizer_def.h"
40 #include "oo_WRITE_BINARY.h"
41 #include "SpeechSynthesizer_def.h"
42 #include "oo_READ_TEXT.h"
43 #include "SpeechSynthesizer_def.h"
44 #include "oo_READ_BINARY.h"
45 #include "SpeechSynthesizer_def.h"
46 #include "oo_DESCRIPTION.h"
47 #include "SpeechSynthesizer_def.h"
48
/*
	Sampling-frequency constant for espeak (presumably in Hz);
	it is not referenced in the visible part of this file.
*/
#define espeak_SAMPLINGFREQUENCY 22050

/*
	Flag word that SpeechSynthesizer_to_Sound assigns just before synthesis;
	presumably defined inside the espeak sources.
*/
extern int option_phoneme_events; // BUG: external declaration outside header file (ppgb 20210307)
52
53 Thing_implement (EspeakVoice, Daata, 0);
54
55 autoEspeakVoice EspeakVoice_create () {
56 try {
57 autoEspeakVoice me = Thing_new (EspeakVoice);
58 my numberOfFormants = 9; // equals N_PEAKS
59 my numberOfKlattParameters = 8;
60 my klattv = zero_INTVEC (my numberOfKlattParameters);
61 my freq = zero_INTVEC (my numberOfFormants);
getSpectralValues(struct tribolet_struct * tbs,double freq_rad,double * xr,double * xi,double * nxr,double * nxi)62 my height = zero_INTVEC (my numberOfFormants); // 100% = 256
63 my width = zero_INTVEC (my numberOfFormants); // 100% = 256
64 my freqadd = zero_INTVEC (my numberOfFormants); // Hz
65
66 // copies without temporary adjustments from embedded commands
67 my freq2 = zero_INTVEC (my numberOfFormants); // 100% = 256
68 my height2 = zero_INTVEC (my numberOfFormants); // 100% = 256
69 my width2 = zero_INTVEC (my numberOfFormants); // 100% = 256
70
71 my breath = zero_INTVEC (my numberOfFormants); // amount of breath for each formant. breath [0] indicates whether any are set.
72 my breathw = zero_INTVEC (my numberOfFormants); // width of each breath formant
73 my numberOfToneAdjusts = 1000; // equals N_TONE_ADJUST in voice.h
74 my tone_adjust = newvectorzero<unsigned char> (my numberOfToneAdjusts);
75 EspeakVoice_setDefaults (me.get());
76 return me;
77 } catch (MelderError) {
78 Melder_throw (U"EspeakVoice not created.");
79 }
80 }
81
82 void EspeakVoice_setDefaults (EspeakVoice me) {
83 (void) me;
84 }
85
86 void EspeakVoice_initFromEspeakVoice (EspeakVoice me, voice_t *voicet) {
87 my v_name = Melder_dup (Melder_peek8to32 (voicet -> v_name));
88
89 my phoneme_tab_ix = voicet -> phoneme_tab_ix;
90 my pitch_base = voicet -> pitch_base;
91 my pitch_range = voicet -> pitch_range;
92
93 my speedf1 = voicet -> speedf1;
94 my speedf2 = voicet -> speedf2;
95 my speedf3 = voicet -> speedf3;
96
97 my speed_percent = voicet -> speed_percent;
98 my flutter = voicet -> flutter;
phase_check(double pv,double * inout_phase,double thlcon)99 my roughness = voicet -> roughness;
100 my echo_delay = voicet -> echo_delay;
101 my echo_amp = voicet -> echo_amp;
102 my n_harmonic_peaks = voicet -> n_harmonic_peaks;
103 my peak_shape = voicet -> peak_shape;
104 my voicing = voicet -> voicing;
105 my formant_factor = voicet -> formant_factor;
106 my consonant_amp = voicet -> consonant_amp;
107 my consonant_ampv = voicet -> consonant_ampv;
108 my samplerate = voicet -> samplerate;
109 my numberOfKlattParameters = 8;
110 for (integer i = 1; i <= my numberOfKlattParameters; i ++)
111 my klattv [i] = voicet -> klattv [i - 1];
112 for (integer i = 1; i <= my numberOfFormants; i ++) {
113 my freq [i] = voicet -> freq [i - 1];
114 my height [i] = voicet -> height [i - 1];
115 my width [i] = voicet -> width [i - 1];
116 my freqadd [i] = voicet -> freqadd [i - 1];
phase_unwrap(struct tribolet_struct * tbs,double pfreq,double ppv,double pdvt,double * pphase,double * ppdvt)117 my freq2 [i] = voicet -> freq2 [i - 1];
118 my height2 [i] = voicet -> height2 [i - 1];
119 my width2 [i] = voicet -> width2 [i - 1];
120 my breath [i] = voicet -> breath [i - 1];
121 my breathw [i] = voicet -> breathw [i - 1];
122 }
123 my numberOfToneAdjusts = 1000;
124 for (integer i = 1; i <= my numberOfToneAdjusts; i ++)
125 my tone_adjust [i] = voicet -> tone_adjust [i - 1];
126 }
127
128 void EspeakVoice_into_voice (EspeakVoice me, voice_t *voicet) { // BUG unused (ppgb 20210307)
129
130 if (my v_name)
131 strncpy (voicet -> v_name, Melder_peek32to8 (my v_name.get()), 40);
132 if (my language_name)
133 strncpy (voicet -> language_name, Melder_peek32to8 (my language_name.get()), 20);
134 voicet -> phoneme_tab_ix = my phoneme_tab_ix;
135 voicet -> pitch_base = my pitch_base;
136 voicet -> pitch_range = my pitch_range;
137
138 voicet -> speedf1 = my speedf1;
139 voicet -> speedf2 = my speedf2;
140 voicet -> speedf3 = my speedf3;
141
142 voicet -> speed_percent = my speed_percent;
143 voicet -> flutter = my flutter;
144 voicet -> roughness = my roughness;
145 voicet -> echo_delay = my echo_delay;
146 voicet -> echo_amp = my echo_amp;
147 voicet -> n_harmonic_peaks = my n_harmonic_peaks;
148 voicet -> peak_shape = my peak_shape;
149 voicet -> voicing = my voicing;
150 voicet -> formant_factor = my formant_factor;
151 voicet -> consonant_amp = my consonant_amp;
152 voicet -> consonant_ampv = my consonant_ampv;
153 voicet -> samplerate = my samplerate;
154 for (integer i = 1; i <= my numberOfKlattParameters; i ++)
155 voicet -> klattv [i - 1] = my klattv [i];
156 for (integer i = 1; i <= my numberOfFormants; i ++) {
157 voicet -> freq [i - 1] = my freq [i];
158 voicet -> height [i - 1] = my height [i];
159 voicet -> width [i - 1] = my width [i];
160 voicet -> freqadd [i - 1] = my freqadd [i];
161 voicet -> freq2 [i - 1] = my freq2 [i];
162 voicet -> height2 [i - 1] = my height2 [i];
163 voicet -> width2 [i - 1] = my width2 [i];
164 voicet -> breath [i - 1] = my breath [i];
165 voicet -> breathw [i - 1] = my breathw [i];
166 }
167 for (integer i = 1; i <= my numberOfToneAdjusts; i ++)
168 voicet -> tone_adjust [i - 1] = voicet -> tone_adjust [i];
169 }
170
171 Thing_implement (SpeechSynthesizer, Daata, 1);
172
173 void structSpeechSynthesizer :: v_info () {
174 our SpeechSynthesizer_Parent :: v_info ();
175 MelderInfo_writeLine (U"Synthesizer version: espeak-ng ", our d_synthesizerVersion.get());
176 MelderInfo_writeLine (U"Language: ", our d_languageName.get());
177 MelderInfo_writeLine (U"Voice: ", our d_voiceName.get());
178 MelderInfo_writeLine (U"Phoneme set: ", our d_phonemeSet.get());
Spectrum_unwrap(Spectrum me)179 MelderInfo_writeLine (U"Input text format: ", (our d_inputTextFormat == SpeechSynthesizer_INPUT_TEXTONLY ? U"text only" :
180 d_inputTextFormat == SpeechSynthesizer_INPUT_PHONEMESONLY ? U"phonemes only" : U"tagged text"));
181 MelderInfo_writeLine (U"Input phoneme coding: ", (our d_inputPhonemeCoding == SpeechSynthesizer_PHONEMECODINGS_KIRSHENBAUM ? U"Kirshenbaum" : U"???"));
182 MelderInfo_writeLine (U"Sampling frequency: ", our d_samplingFrequency, U" Hz");
183 MelderInfo_writeLine (U"Word gap: ", our d_wordGap, U" s");
184 MelderInfo_writeLine (U"Pitch multiplier: ", our d_pitchAdjustment, U" (0.5-2.0)");
185 MelderInfo_writeLine (U"Pitch range multiplier: ", our d_pitchRange, U" (0.0-2.0)");
186 MelderInfo_writeLine (U"Speaking rate: ", our d_wordsPerMinute, U" words per minute",
187 our d_estimateSpeechRate ? U" (but estimated from speech if possible)" : U" (fixed)");
188 MelderInfo_writeLine (U"Output phoneme coding: ",
189 our d_inputPhonemeCoding == SpeechSynthesizer_PHONEMECODINGS_KIRSHENBAUM ? U"Kirshenbaum" :
190 our d_inputPhonemeCoding == SpeechSynthesizer_PHONEMECODINGS_IPA ? U"IPA" : U"???"
191 );
192 }
193
194 static int synthCallback (short *wav, int numsamples, espeak_EVENT *events)
195 {
196 char phoneme_name [9];
197 if (wav == 0) return 1;
198
199 // It is essential that the SpeechSynthesizer is identified here by the user_data,
200 // because the espeakEVENT_LIST_TERMINATED event may still be accompanied by
201 // a piece of audio data!!
202
203 SpeechSynthesizer me = (SpeechSynthesizer) (events -> user_data);
204 while (events -> type != espeakEVENT_LIST_TERMINATED) {
205 if (events -> type == espeakEVENT_SAMPLERATE) {
206 my d_internalSamplingFrequency = events -> id.number;
207 } else {
208 //my events = Table "time type type-t t-pos length a-pos sample id uniq";
209 // 1 2 3 4 5 6 7 8 9
210 Table_appendRow (my d_events.get());
211 const integer irow = my d_events -> rows.size;
212 const double time = events -> audio_position * 0.001;
213 Table_setNumericValue (my d_events.get(), irow, 1, time);
214 Table_setNumericValue (my d_events.get(), irow, 2, events -> type);
215 // Column 3 will be filled afterwards
216 Table_setNumericValue (my d_events.get(), irow, 4, events -> text_position);
217 Table_setNumericValue (my d_events.get(), irow, 5, events -> length);
218 Table_setNumericValue (my d_events.get(), irow, 6, events -> audio_position);
219 Table_setNumericValue (my d_events.get(), irow, 7, events -> sample);
220 if (events -> type == espeakEVENT_MARK || events -> type == espeakEVENT_PLAY) {
221 Table_setStringValue (my d_events.get(), irow, 8, Melder_peek8to32 (events -> id.name));
222 } else {
223 // Ugly hack because id.string is not 0-terminated if 8 chars long!
224 memcpy (phoneme_name, events -> id.string, 8);
225 phoneme_name [8] = 0;
226 Table_setStringValue (my d_events.get(), irow, 8, Melder_peek8to32 (phoneme_name));
227 }
228 Table_setNumericValue (my d_events.get(), irow, 9, events -> unique_identifier);
229 }
230 events++;
231 }
232 if (me) {
233 my d_wav.resize (my d_numberOfSamples + numsamples);
234 for (integer i = 1; i <= numsamples; i++)
235 my d_wav [my d_numberOfSamples + i] = wav [i - 1];
236 my d_numberOfSamples += numsamples;
237 }
238 return 0;
239 }
240
241 static conststring32 SpeechSynthesizer_getLanguageCode (SpeechSynthesizer me) {
242 try {
243 const integer irow = Table_searchColumn (espeakdata_languages_propertiesTable.get(), 2, my d_languageName.get());
244 Melder_require (irow != 0,
245 U"Cannot find language \"", my d_languageName.get(), U"\".");
246 return Table_getStringValue_Assert (espeakdata_languages_propertiesTable.get(), irow, 1);
247 } catch (MelderError) {
248 Melder_throw (me, U": Cannot find language code.");
249 }
250 }
251
Spectrum_drawPhases(Spectrum me,Graphics g,double fmin,double fmax,double phase_min,double phase_max,int unwrap,bool)252 static conststring32 SpeechSynthesizer_getPhonemeCode (SpeechSynthesizer me) {
253 try {
254 const integer irow = Table_searchColumn (espeakdata_languages_propertiesTable.get(), 2, my d_phonemeSet.get());
255 Melder_require (irow != 0,
256 U"Cannot find phoneme set \"", my d_phonemeSet.get(), U"\".");
257 return Table_getStringValue_Assert (espeakdata_languages_propertiesTable.get(), irow, 1);
258 } catch (MelderError) {
259 Melder_throw (me, U": Cannot find phoneme code.");
260 }
261 }
262
263 static conststring32 SpeechSynthesizer_getVoiceCode (SpeechSynthesizer me) {
264 try {
265 const integer irow = Table_searchColumn (espeakdata_voices_propertiesTable.get(), 2, my d_voiceName.get());
266 Melder_require (irow != 0,
Spectra_multiply(Spectrum me,Spectrum thee)267 U": Cannot find voice variant \"", my d_voiceName.get(), U"\".");
268 return Table_getStringValue_Assert (espeakdata_voices_propertiesTable.get(), irow, 1);
269 } catch (MelderError) {
270 Melder_throw (me, U": Cannot find voice code.");
271 }
272 }
273
274 autoSpeechSynthesizer SpeechSynthesizer_create (conststring32 languageName, conststring32 voiceName) {
275 try {
276 autoSpeechSynthesizer me = Thing_new (SpeechSynthesizer);
277 my d_synthesizerVersion = Melder_dup (ESPEAK_NG_VERSION);
278 my d_languageName = Melder_dup (languageName);
279 (void) SpeechSynthesizer_getLanguageCode (me.get()); // existence check
280 my d_voiceName = Melder_dup (voiceName);
281 (void) SpeechSynthesizer_getVoiceCode (me.get()); // existence check
282 my d_phonemeSet = Melder_dup (languageName);
283 SpeechSynthesizer_setTextInputSettings (me.get(), SpeechSynthesizer_INPUT_TEXTONLY, SpeechSynthesizer_PHONEMECODINGS_KIRSHENBAUM);
Spectrum_shiftPhaseBy90Degrees(Spectrum me)284 SpeechSynthesizer_setSpeechOutputSettings (me.get(), 44100.0, 0.01, 1.0, 1.0, 175.0, SpeechSynthesizer_PHONEMECODINGS_IPA);
285 SpeechSynthesizer_setEstimateSpeechRateFromSpeech (me.get(), true);
286 return me;
287 } catch (MelderError) {
288 Melder_throw (U"SpeechSynthesizer not created.");
289 }
290 }
291
Spectrum_unshiftPhaseBy90Degrees(Spectrum me)292 void SpeechSynthesizer_setTextInputSettings (SpeechSynthesizer me, int inputTextFormat, int inputPhonemeCoding) {
293 my d_inputTextFormat = inputTextFormat;
294 my d_inputPhonemeCoding = inputPhonemeCoding;
295 }
296
297 void SpeechSynthesizer_setEstimateSpeechRateFromSpeech (SpeechSynthesizer me, bool estimate) {
298 my d_estimateSpeechRate = estimate;
299 }
Spectrum_conjugate(Spectrum me)300
301 void SpeechSynthesizer_setSpeechOutputSettings (SpeechSynthesizer me, double samplingFrequency, double wordGap, double pitchAdjustment, double pitchRange, double wordsPerMinute, int outputPhonemeCoding) {
302 my d_samplingFrequency = samplingFrequency;
303 my d_wordGap = wordGap;
304 my d_pitchAdjustment = Melder_clipped (0.5, pitchAdjustment, 2.0);
305 my d_pitchRange = Melder_clipped (0.0, pitchRange, 2.0);
306 if (wordsPerMinute <= 0.0)
307 wordsPerMinute = 175.0; // SMELL: looks like an arbitrary default
308 my d_wordsPerMinute = Melder_clipped (80.0, wordsPerMinute, 450.0);
309 my d_outputPhonemeCoding = outputPhonemeCoding;
310 }
311
312 void SpeechSynthesizer_playText (SpeechSynthesizer me, conststring32 text) {
313 autoSound thee = SpeechSynthesizer_to_Sound (me, text, nullptr, nullptr);
314 Sound_play (thee.get(), nullptr, nullptr);
315 }
316
317 static autoSound buffer_to_Sound (constINTVEC const& wav, double samplingFrequency)
318 {
319 try {
320 const double dx = 1.0 / samplingFrequency;
321 const double xmax = wav.size * dx;
322 autoSound thee = Sound_create (1, 0.0, xmax, wav.size, dx, dx / 2.0);
323 for (integer i = 1; i <= wav.size; i++)
324 thy z [1] [i] = wav [i] / 32768.0;
325 return thee;
326 } catch (MelderError) {
327 Melder_throw (U"Sound not created from synthesizer data.");
328 }
329 }
330
331 static void IntervalTier_addBoundaryUnsorted (IntervalTier me, integer iinterval, double time, conststring32 newLabel, bool isNewleftLabel) {
332 Melder_require (time > my xmin && time < my xmax,
333 U"Time is outside interval domains.");
334 /*
335 Find interval to split
336 */
337 if (iinterval <= 0)
338 iinterval = IntervalTier_timeToLowIndex (me, time);
339 /*
340 Modify end time of left label
341 */
342 const TextInterval ti = my intervals.at [iinterval];
343 ti -> xmax = time;
344 if (isNewleftLabel)
345 TextInterval_setText (ti, newLabel);
346 autoTextInterval ti_new = TextInterval_create (time, my xmax, (! isNewleftLabel ? newLabel : U"" ));
347 my intervals. addItem_unsorted_move (ti_new.move());
348 }
349
350 static void Table_setEventTypeString (Table me) {
351 try {
352 for (integer i = 1; i <= my rows.size; i ++) {
353 const int type = Table_getNumericValue_Assert (me, i, 2);
354 conststring32 label = U"0";
355 if (type == espeakEVENT_WORD)
356 label = U"word";
357 else if (type == espeakEVENT_SENTENCE)
358 label = U"sent";
359 else if (type == espeakEVENT_MARK)
Spectrum_shiftFrequencies(Spectrum me,double shiftBy,double newMaximumFrequency,integer interpolationDepth)360 label = U"mark";
361 else if (type == espeakEVENT_PLAY)
362 label = U"play";
363 else if (type == espeakEVENT_END)
364 label = U"s-end";
365 else if (type == espeakEVENT_MSG_TERMINATED)
366 label = U"msg_term";
367 else if (type == espeakEVENT_PHONEME)
368 label = U"phoneme";
369 Table_setStringValue (me, i, 3, label);
370 }
371 } catch (MelderError) {
372 Melder_throw (U"Event types not set.");
373 }
374 }
375
376 static void MelderString_trimWhiteSpaceAtEnd (MelderString *me) {
377 while (my length > 1 && (my string [my length - 1] == U' ' || my string [my length - 1] == U'\t'
378 || my string [my length - 1] == U'\r' || my string [my length - 1] == U'\n'))
379 my string [-- my length] = U'\0';
380 }
381
382 static void IntervalTier_mergeSpecialIntervals (IntervalTier me) {
383 integer intervalIndex = my intervals.size;
384 TextInterval right = my intervals.at [intervalIndex];
385 const integer labelLength_right = TextInterval_labelLength (right);
386 bool isEmptyInterval_right = labelLength_right == 0 || (labelLength_right == 1 && Melder_equ (right -> text.get(), U"\001"));
387 while (intervalIndex > 1) {
388 const TextInterval left = my intervals.at [intervalIndex - 1];
389 const integer labelLength_left = TextInterval_labelLength (left);
390 const bool isEmptyInterval_left = labelLength_left == 0 || (labelLength_left == 1 && Melder_equ (left -> text.get(), U"\001"));
391 if (isEmptyInterval_right && isEmptyInterval_left) {
Spectrum_compressFrequencyDomain(Spectrum me,double fmax,integer interpolationDepth,int freqscale,int method)392 /*
393 Remove right interval and empty left interval
394 */
395 left -> xmax = right -> xmax;
396 TextInterval_setText (left, U"");
397 my intervals. removeItem (intervalIndex);
398 }
399 right = left;
400 isEmptyInterval_right = isEmptyInterval_left;
401 intervalIndex --;
402 }
403 }
404
#if 0   // BUG unused (ppgb 20210307)
/*
	Insert a boundary at time t and replace everything after t by a single empty interval.
	Nothing happens if t does not lie strictly inside the domain of the tier.
*/
static void IntervalTier_insertBoundaryAndMergeIntervalsAfter (IntervalTier me, double t) {
	if (t <= my xmin || t >= my xmax)
		return;

	/*
		Keep at most one interval behind the interval that contains t.
	*/
	const integer containingIndex = IntervalTier_timeToLowIndex (me, t);
	while (my intervals.size > containingIndex + 1)
		my intervals. removeItem (my intervals.size);

	const TextInterval containing = my intervals.at [containingIndex];
	if (containing -> xmin == t) {
		/*
			t happens to be on a boundary: remove the next interval if it exists,
			and let the containing interval run to the end, without text.
		*/
		if (my intervals.size > containingIndex)
			my intervals. removeItem (my intervals.size);
		containing -> xmax = my xmax;
		TextInterval_setText (containing, U"");
	} else {
		/*
			Cut the containing interval at t; the last interval covers the rest, without text.
		*/
		containing -> xmax = t;
		const TextInterval last = my intervals.at [my intervals.size];
		last -> xmin = t;
		last -> xmax = my xmax;
		TextInterval_setText (last, U"");
	}
}
#endif
431
/*
	Return true if t2 lies within a relative distance of 1e-12 from t1.
	The tolerance scales with |t1| only, so the test is not symmetric in its arguments;
	for t1 == 0 it succeeds only if t2 is exactly 0 as well.
*/
static bool almost_equal (double t1, double t2) {
	const double distance = fabs (t1 - t2);
	const double tolerance = 1e-12 * fabs (t1);
	// the "=" sign is essential for a difference of zero if t1 == 0
	return distance <= tolerance;
}
436
437 static void IntervalTier_insertEmptyIntervalsFromOtherTier (IntervalTier to, IntervalTier from) {
438 for (integer iint = 1; iint <= from -> intervals.size; iint ++) {
439 const TextInterval tifrom = from -> intervals.at [iint];
440 if (TextInterval_labelLength (tifrom) == 0) { // found empty interval
441 const double t_left = tifrom -> xmin, t_right = tifrom -> xmax;
442 integer intervalIndex_to = IntervalTier_timeToLowIndex (to, t_left);
443 if (intervalIndex_to > 0) { // insert to the right of intervalIndex_to
444 const TextInterval tito = to -> intervals.at [intervalIndex_to];
445 if (! almost_equal (tito -> xmin, t_left)) { // not on the start boundary of the interval, it cannot be at xmax
446 autoTextInterval newInterval = TextInterval_create (t_left, tito -> xmax, U"");
447 tito -> xmax = t_left;
448 to -> intervals. addItem_move (newInterval.move());
449 }
450 }
451 intervalIndex_to = IntervalTier_timeToHighIndex (to, t_right);
452 const TextInterval tito = to -> intervals.at [intervalIndex_to];
453 if (intervalIndex_to > 0) {
454 if (! almost_equal (t_right, tito -> xmax)) { // insert to the left of intervalIndex_to
455 autoTextInterval newInterval = TextInterval_create (tito -> xmin, t_right, U"");
456 tito -> xmin = t_right;
457 to -> intervals. addItem_move (newInterval.move());
458 }
459 }
460 }
461 }
462 }
463
464 static void IntervalTier_removeVeryShortIntervals (IntervalTier me) {
465 integer iint = 1;
466 while (iint <= my intervals.size) {
467 const TextInterval ti = my intervals.at [iint];
468 if (almost_equal (ti -> xmin, ti -> xmax))
469 my intervals.removeItem (iint);
470 else
471 iint ++;
472 }
473 }
474
475 static autoTextGrid Table_to_TextGrid (Table me, conststring32 text, double xmin, double xmax) {
476 //Table_createWithColumnNames (0, L"time type type-t t-pos length a-pos sample id uniq");
477 try {
478 const integer textLength = str32len (text);
479 const integer numberOfRows = my rows.size;
480 const integer timeColumnIndex = Table_getColumnIndexFromColumnLabel (me, U"time");
481 const integer typeColumnIndex = Table_getColumnIndexFromColumnLabel (me, U"type");
482 const integer tposColumnIndex = Table_getColumnIndexFromColumnLabel (me, U"t-pos");
483 const integer idColumnIndex = Table_getColumnIndexFromColumnLabel (me, U"id");
484 autoTextGrid thee = TextGrid_create (xmin, xmax, U"sentence clause word phoneme", U"");
485
486 TextGrid_setIntervalText (thee.get(), 1, 1, text);
487
488 integer p1c = 1, p1w = 1;
489 double time_phon_p = xmin;
490 bool wordEnd = false;
491 autoMelderString mark;
492
493 const IntervalTier clauses = (IntervalTier) thy tiers->at [2];
494 const IntervalTier words = (IntervalTier) thy tiers->at [3];
495 const IntervalTier phonemes = (IntervalTier) thy tiers->at [4];
496 for (integer i = 1; i <= numberOfRows; i++) {
497 const double time = Table_getNumericValue_Assert (me, i, timeColumnIndex);
498 const int type = Table_getNumericValue_Assert (me, i, typeColumnIndex);
499 const integer pos = Table_getNumericValue_Assert (me, i, tposColumnIndex);
500 integer length;
501 if (type == espeakEVENT_SENTENCE) {
502 /*
503 Only insert a new boundary, no text
504 text will be inserted at end sentence event
505 */
506 if (time > xmin && time < xmax)
507 IntervalTier_addBoundaryUnsorted (clauses, clauses -> intervals.size, time, U"", true);
508 p1c = pos;
509 } else if (type == espeakEVENT_END) {
510 /*
511 End of clause: insert new boundary, and fill left interval with text
512 */
513 length = pos - p1c + 1;
514 MelderString_ncopy (&mark, text + p1c - 1, length);
515 MelderString_trimWhiteSpaceAtEnd (& mark);
516 if (time > xmin && time < xmax)
517 IntervalTier_addBoundaryUnsorted (clauses, clauses -> intervals.size, time, mark.string, true);
518 else
519 TextGrid_setIntervalText (thee.get(), 2, clauses -> intervals.size, mark.string);
520 p1c = pos;
521 /*
522 End of clause always signals "end of a word"
523 */
524 if (pos <= textLength) {
525 length = pos - p1w + 1;
526 MelderString_ncopy (&mark, text + p1w - 1, length);
527 MelderString_trimWhiteSpaceAtEnd (& mark);
528 if (time > xmin && time < xmax)
529 IntervalTier_addBoundaryUnsorted (words, words -> intervals.size, time, mark.string, true);
530 else
531 TextGrid_setIntervalText (thee.get(), 3, words -> intervals.size, mark.string);
532 /*
533 Now the next word event should not trigger setting the left interval text
534 */
535 wordEnd = false;
536 }
537 } else if (type == espeakEVENT_WORD) {
538 if (pos < p1w)
539 continue;
540 if (time > xmin && time < xmax) {
541 length = pos - p1w;
542 if (pos == textLength)
543 length++;
544 MelderString_ncopy (&mark, text + p1w - 1, length);
545 MelderString_trimWhiteSpaceAtEnd (& mark);
546 IntervalTier_addBoundaryUnsorted (words, words -> intervals.size, time, ( wordEnd ? mark.string : U"" ), true);
547 }
548 wordEnd = true;
549 p1w = pos;
550 } else if (type == espeakEVENT_PHONEME) {
551 const conststring32 id = Table_getStringValue_Assert (me, i, idColumnIndex);
552 if (time > time_phon_p) {
553 /*
554 Insert new boudary and label interval with the id
555 TODO: Translate the id to the correct notation
556 */
557 TextInterval ti = phonemes -> intervals.at [phonemes -> intervals.size];
558 if (time > ti -> xmin && time < ti -> xmax)
559 IntervalTier_addBoundaryUnsorted (phonemes, phonemes -> intervals.size, time, id, false);
560 } else {
561 /*
562 Just in case the phoneme starts at xmin we only need to set interval text
563 */
564 TextGrid_setIntervalText (thee.get(), 4, phonemes -> intervals.size, id);
565 }
566 time_phon_p = time;
567 }
568 }
569 clauses -> intervals. sort ();
570 words -> intervals. sort ();
571 phonemes -> intervals. sort ();
572
573 IntervalTier_mergeSpecialIntervals (phonemes); // Merge neighbouring empty U"" and U"\001" intervals
574
575 IntervalTier_removeVeryShortIntervals (words);
576 IntervalTier_removeVeryShortIntervals (clauses);
577 /*
578 Use empty intervals in phoneme tier for more precision in the word tier
579 */
580 IntervalTier_insertEmptyIntervalsFromOtherTier (words, phonemes);
581 IntervalTier_mergeSpecialIntervals (words); // Merge neighbouring empty U"" and U"\001" intervals
582
583 return thee;
584 } catch (MelderError) {
585 Melder_throw (U"TextGrid not created from Table with events.");
586 }
587 }
588
#if 0   // BUG unused (ppgb 20210307)

/*
	Load the language as voice and the named voice as variant, and make the combination current.
	Nothing is changed if the language cannot be loaded.
*/
static void espeakdata_SetVoiceByName (conststring32 languageName, conststring32 voiceName) {
	espeak_VOICE voice_selector;
	memset (& voice_selector, 0, sizeof voice_selector);
	voice_selector.name = Melder_peek32to8 (Melder_cat (languageName, U"+", voiceName));   // include variant name in voice stack ??

	if (! LoadVoice (Melder_peek32to8 (languageName), 1))
		return;
	LoadVoice (Melder_peek32to8 (voiceName), 2);
	DoVoiceChange (voice);
	SetVoiceStack (& voice_selector, Melder_peek32to8 (voiceName));
}
#endif
604
605 autoSound SpeechSynthesizer_to_Sound (SpeechSynthesizer me, conststring32 text, autoTextGrid *tg, autoTable *events) {
606 try {
607 espeak_ng_InitializePath (nullptr); // PATH_ESPEAK_DATA
608 espeak_ng_ERROR_CONTEXT context = { 0 };
609 espeak_ng_STATUS status = espeak_ng_Initialize (& context);
610 Melder_require (status == ENS_OK,
611 U"Internal espeak error.", status);
612 int synth_flags = espeakCHARS_WCHAR;
613 if (my d_inputTextFormat == SpeechSynthesizer_INPUT_TAGGEDTEXT)
614 synth_flags |= espeakSSML;
615 if (my d_inputTextFormat != SpeechSynthesizer_INPUT_TEXTONLY)
616 synth_flags |= espeakPHONEMES;
617 option_phoneme_events = espeakINITIALIZE_PHONEME_EVENTS; // extern int option_phoneme_events;
618 if (my d_outputPhonemeCoding == SpeechSynthesizer_PHONEMECODINGS_IPA)
619 option_phoneme_events |= espeakINITIALIZE_PHONEME_IPA;
620
621 espeak_ng_SetParameter (espeakRATE, my d_wordsPerMinute, 0);
622 /*
623 pitchAdjustment_0_99 = a * log10 (my d_pitchAdjustment) + b,
624 where 0.5 <= my d_pitchAdjustment <= 2
625 pitchRange_0_99 = my d_pitchRange * 49.5,
626 where 0 <= my d_pitchRange <= 2
627 */
628 const int pitchAdjustment_0_99 = (int) ((49.5 / NUMlog10_2) * log10 (my d_pitchAdjustment) + 49.5); // rounded towards zero
629 espeak_ng_SetParameter (espeakPITCH, pitchAdjustment_0_99, 0);
630 const int pitchRange_0_99 = (int) (my d_pitchRange * 49.5); // rounded towards zero
631 espeak_ng_SetParameter (espeakRANGE, pitchRange_0_99, 0);
632 const conststring32 languageCode = SpeechSynthesizer_getLanguageCode (me);
633 const conststring32 voiceCode = SpeechSynthesizer_getVoiceCode (me);
634
635 espeak_ng_SetVoiceByName (Melder_peek32to8 (Melder_cat (languageCode, U"+", voiceCode)));
636 const int wordGap_10ms = my d_wordGap * 100; // espeak word gap is in units of 10 ms
637 espeak_ng_SetParameter (espeakWORDGAP, wordGap_10ms, 0);
638 espeak_ng_SetParameter (espeakCAPITALS, 0, 0);
639 espeak_ng_SetParameter (espeakPUNCTUATION, espeakPUNCT_NONE, 0);
640
641 status = espeak_ng_InitializeOutput (ENOUTPUT_MODE_SYNCHRONOUS, 2048, nullptr);
642 espeak_SetSynthCallback (synthCallback);
643 if (! Melder_equ (my d_phonemeSet.get(), my d_languageName.get())) {
644 const conststring32 phonemeCode = SpeechSynthesizer_getPhonemeCode (me);
645 const int index_phon_table_list = LookupPhonemeTable (Melder_peek32to8 (phonemeCode));
646 if (index_phon_table_list > 0) {
647 voice -> phoneme_tab_ix = index_phon_table_list;
648 DoVoiceChange(voice);
649 }
650 }
651
652 const conststring32 columnNames [] =
653 { U"time", U"type", U"type-t", U"t-pos", U"length", U"a-pos", U"sample", U"id", U"uniq" };
654 my d_events = Table_createWithColumnNames (0, ARRAY_TO_STRVEC (columnNames));
655
656 #ifdef _WIN32
657 conststringW textW = Melder_peek32toW (text);
658 espeak_ng_Synthesize (textW, wcslen (textW) + 1, 0, POS_CHARACTER, 0, synth_flags, nullptr, me);
659 #else
660 espeak_ng_Synthesize (text, str32len (text) + 1, 0, POS_CHARACTER, 0, synth_flags, nullptr, me);
661 #endif
662
663 espeak_ng_Terminate ();
664 autoSound thee = buffer_to_Sound (my d_wav.get(), my d_internalSamplingFrequency);
665
666 if (my d_samplingFrequency != my d_internalSamplingFrequency)
667 thee = Sound_resample (thee.get(), my d_samplingFrequency, 50);
668 my d_numberOfSamples = 0; // re-use the wav-buffer
669 if (tg) {
670 double xmin = Table_getNumericValue_Assert (my d_events.get(), 1, 1);
671 if (xmin > thy xmin)
672 xmin = thy xmin;
673 double xmax = Table_getNumericValue_Assert (my d_events.get(), my d_events -> rows.size, 1);
674 if (xmax < thy xmax)
675 xmax = thy xmax;
676 autoTextGrid tg1 = Table_to_TextGrid (my d_events.get(), text, xmin, xmax);
677 *tg = TextGrid_extractPart (tg1.get(), thy xmin, thy xmax, 0);
678 }
679 if (events) {
680 Table_setEventTypeString (my d_events.get());
681 *events = my d_events.move();
682 }
683 my d_events.reset();
684 return thee;
685 } catch (MelderError) {
686 espeak_Terminate ();
687 Melder_throw (U"SpeechSynthesizer: text not converted to Sound.");
688 }
689 }
690
691 /* End of file SpeechSynthesizer.cpp */
692