1 /* SpeechSynthesizer_and_TextGrid.cpp
2 *
3 * Copyright (C) 2011-2019 David Weenink
4 *
5 * This code is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or (at
8 * your option) any later version.
9 *
10 * This code is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this work. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 /*
20 djmw 20111214
21 */
22
23 #include "DTW.h"
24 #include "Sounds_to_DTW.h"
25 #include "Sound_extensions.h"
26 #include "SpeechSynthesizer_and_TextGrid.h"
27 #include "CCs_to_DTW.h"
28 #include "DTW_and_TextGrid.h"
29 #include "NUMmachar.h"
30
31 // prototypes
32 static void IntervalTier_splitInterval (IntervalTier me, double time, conststring32 leftLabel, integer interval, double precision);
33 static autoIntervalTier IntervalTier_IntervalTier_cutPartsMatchingLabel (IntervalTier me, IntervalTier thee, conststring32 label, double precision);
34 static autoIntervalTier IntervalTiers_patch_noBoundaries (IntervalTier me, IntervalTier thee, conststring32 patchLabel, double precision);
35 static autoTable IntervalTiers_to_Table_textAlignmentment (IntervalTier target, IntervalTier source, EditCostsTable costs);
36
37
38 static void IntervalTier_checkRange (IntervalTier me, integer startInterval, integer endinterval) {
39 Melder_require (startInterval <= endinterval,
40 U"The interval range end number should not be smaller than the interval range start number.");
41 Melder_require (startInterval > 0,
42 U"The specified interval range start number is ", startInterval, U", but should be at least 1.");
43 Melder_require (endinterval <= my intervals.size,
44 U"The specified interval range end number (", endinterval, U") exceeds the number of intervals (", my intervals.size, U") in this tier.");
45 }
46
47 autoSound SpeechSynthesizer_TextInterval_to_Sound (SpeechSynthesizer me, TextInterval thee, autoTextGrid *out_textgrid)
48 {
49 try {
50 Melder_require (thy text && thy text [0] != U'\0',
51 U"TextInterval should not be empty.");
52 autoSound him = SpeechSynthesizer_to_Sound (me, thy text.get(), out_textgrid, nullptr);
53 return him;
54 } catch (MelderError) {
EspeakVoice_create()55 Melder_throw (U"Sound not created from TextInterval.");
56 }
57 }
58
59 autoSound SpeechSynthesizer_TextGrid_to_Sound (SpeechSynthesizer me, TextGrid thee, integer tierNumber, integer iinterval, autoTextGrid *out_textgrid) {
60 try {
61 TextGrid_checkSpecifiedTierNumberWithinRange (thee, tierNumber);
62 const IntervalTier intervalTier = (IntervalTier) thy tiers->at [tierNumber];
63 Melder_require (intervalTier -> classInfo == classIntervalTier,
64 U"Tier ", tierNumber, U" is not an interval tier.");
65 Melder_require (iinterval > 0 && iinterval <= intervalTier -> intervals.size,
66 U"Interval ", iinterval, U" does not exist on tier ", tierNumber, U".");
67 return SpeechSynthesizer_TextInterval_to_Sound (me, intervalTier -> intervals.at [iinterval], out_textgrid);
68 } catch (MelderError) {
69 Melder_throw (U"Sound not created from textGrid.");
70 }
71 }
72
73 #if 0
/*
	Disabled code (this function sits inside an `#if 0` region and is not compiled).

	Return the start time (xmin) of the first interval on interval tier `tierNumber`
	whose text compares equal to `label`, or `undefined` if no interval matches.
	Throws if the tier number is out of range or the tier is not an interval tier.
*/
static double TextGrid_getStartTimeOfFirstOccurrence (TextGrid thee, integer tierNumber, conststring32 label) {
	TextGrid_checkSpecifiedTierNumberWithinRange (thee, tierNumber);
	const IntervalTier intervalTier = (IntervalTier) thy tiers->at [tierNumber];
	Melder_require (intervalTier -> classInfo == classIntervalTier,
		U"Tier ", tierNumber, U" is not an interval tier.");
	double start = undefined;
	// scan from the first interval forward and stop at the first match
	for (integer iint = 1; iint <= intervalTier -> intervals.size; iint ++) {
		const TextInterval ti = intervalTier -> intervals.at [iint];
		// NOTE(review): the live code in this file passes `text.get()` to the comparison functions; this call would presumably need the same before re-enabling
		if (Melder_cmp (ti -> text, label) == 0) {
			start = ti -> xmin;
			break;
		}
	}
	return start;
}
89
/*
	Disabled code (this function sits inside an `#if 0` region and is not compiled).

	Return the end time (xmax) of the last interval on interval tier `tierNumber`
	whose text equals `label`, or `undefined` if no interval matches.
	Throws if the tier number is out of range or the tier is not an interval tier.
*/
static double TextGrid_getEndTimeOfLastOccurrence (TextGrid thee, integer tierNumber, conststring32 label) {
	TextGrid_checkSpecifiedTierNumberWithinRange (thee, tierNumber);
	IntervalTier intervalTier = (IntervalTier) thy tiers->at [tierNumber];
	Melder_require (intervalTier -> classInfo == classIntervalTier,
		U"Tier ", tierNumber, U" is not an interval tier.");
	double end = undefined;
	// scan from the last interval backward and stop at the first match
	for (integer iint = intervalTier -> intervals.size; iint > 0; iint --) {
		const TextInterval ti = intervalTier -> intervals.at [iint];
		// NOTE(review): the live code in this file passes `text.get()` to Melder_equ; this call would presumably need the same before re-enabling
		if (Melder_equ (ti -> text, label)) {
			end = ti -> xmax;
			break;
		}
	}
	return end;
}
105 #endif
106
107 static void IntervalTier_getLabelInfo (IntervalTier me, conststring32 label, double *labelDurations, integer *numberOfOccurences) {
108 *labelDurations = 0.0;
109 *numberOfOccurences = 0;
110 for (integer i = 1; i <= my intervals.size; i ++) {
111 const TextInterval ti = my intervals.at [i];
112 if (Melder_equ (ti -> text.get(), label)) {
113 *labelDurations += ti -> xmax - ti -> xmin;
114 (*numberOfOccurences) ++;
115 }
116 }
117 }
118
119 #define TIMES_ARE_CLOSE(x,y) (fabs((x)-(y)) < precision)
120 void IntervalTier_splitInterval (IntervalTier me, double time, conststring32 leftLabel, integer interval, double precision) {
121 try {
122 Melder_assert (interval > 0);
123 TextInterval ti = nullptr;
124 integer index = 0;
125 for (integer i = interval; i <= my intervals.size; i ++) {
126 ti = my intervals.at [i];
127 if (time < ti -> xmax + precision && time > ti -> xmin - precision) {
EspeakVoice_into_voice(EspeakVoice me,voice_t * voicet)128 index = i;
129 break;
130 }
131 }
132 // if index == 0 then search left intervals??
133 if (index == 0 || TIMES_ARE_CLOSE(time, ti -> xmin) || TIMES_ARE_CLOSE(time, ti -> xmax))
134 return;
135 autoTextInterval newInterval = TextInterval_create (ti -> xmin, time, leftLabel);
136 /*
137 Make start of current and begin of new interval equal
138 */
139 ti -> xmin = time;
140 my intervals. addItem_move (newInterval.move());
141 } catch (MelderError) {
142 Melder_throw (U"Boundary not inserted.");
143 }
144
145 }
146
147 static autoTextTier TextTier_IntervalTier_cutPartsMatchingLabel (TextTier me, IntervalTier thee, conststring32 label, double precision) {
148 try {
149 Melder_require (my xmin == thy xmin && my xmax == thy xmax,
150 U"Domains should be equal.");
151 integer myIndex = 1;
152 double timeCut = 0.0;
153 autoTextTier him = TextTier_create (0.0, my xmax - my xmin);
154 for (integer j = 1; j <= thy intervals.size; j ++) {
155 const TextInterval cut = thy intervals.at [j];
156 if (Melder_equ (cut -> text.get(), label))
157 timeCut += cut -> xmax - cut -> xmin;
158 else {
159 while (myIndex <= my points.size) {
160 const TextPoint tp = my points.at [myIndex];
161 if (tp -> number < cut -> xmin - precision) {
162 // point is left of cut
163 myIndex ++;
164 } else if (tp -> number < cut -> xmax + precision) {
165 // point is in (no)cut
166 const double time = tp -> number - my xmin - timeCut;
167 TextTier_addPoint (him.get(), time, tp -> mark.get());
168 myIndex ++;
169 } else {
170 break;
171 }
172 }
v_info()173 }
174 }
175 his xmax -= timeCut;
176 return him;
177 } catch (MelderError) {
178 Melder_throw (me, U": parts not cut.");
179 }
180 }
181
182 // Cut parts from me marked by labels in thee
183 autoIntervalTier IntervalTier_IntervalTier_cutPartsMatchingLabel (IntervalTier me, IntervalTier thee, conststring32 label, double precision) {
184 try {
185 Melder_require (my xmin == thy xmin && my xmax != thy xmax,
186 U"Domains should be identical.");
187 autoVEC durations = raw_VEC (my intervals.size);
188 for (integer i = 1; i <= my intervals.size; i ++) {
189 const TextInterval ti = my intervals.at [i];
190 durations [i] = ti -> xmax - ti -> xmin;
191 }
192 integer myInterval = 1;
193 for (integer j = 1; j <= thy intervals.size; j ++) {
synthCallback(short * wav,int numsamples,espeak_EVENT * events)194 const TextInterval cut = thy intervals.at [j];
195 if (Melder_equ (cut -> text.get(), label)) { // trim
196 while (myInterval <= my intervals.size) {
197 const TextInterval ti = my intervals.at [myInterval];
198 if (ti -> xmin > cut -> xmin - precision && ti -> xmax < cut -> xmax + precision) {
199 /*
200 1. Interval completely within cut
201 */
202 durations [myInterval] = 0.0;
203 myInterval ++;
204 } else if (ti -> xmin < cut -> xmin + precision && cut -> xmin < ti -> xmax + precision) {
205 /*
206 2. Cut start is within interval
207 */
208 if (cut -> xmax > ti -> xmax - precision) {
209 /*
210 Interval end is in cut, interval start before
211 */
212 durations [myInterval] -= ti -> xmax - cut -> xmin;
213 myInterval ++;
214 } else {
215 /*
216 3. cut completely within interval
217 */
218 durations [myInterval] -= cut -> xmax - cut -> xmin;
219 break;
220 }
221 } else if (cut -> xmax > ti -> xmin - precision && cut -> xmin < ti -> xmax + precision) {
222 /*
223 1+2 : cut end is within interval, cut start before
224 */
225 durations [myInterval] -= cut -> xmax - ti -> xmin;
226 break;
227 } else if (ti -> xmax < cut -> xmin + precision) {
228 myInterval ++;
229 }
230 }
231 }
232 }
233 longdouble totalDuration = 0.0;
234 for (integer i = 1; i <= my intervals.size; i ++) {
235 if (durations [i] < precision)
236 durations [i] = 0.0;
237 totalDuration += durations [i];
238 }
239 autoIntervalTier him = IntervalTier_create (0, double (totalDuration));
240 double time = 0.0;
SpeechSynthesizer_getLanguageCode(SpeechSynthesizer me)241 integer hisInterval = 1;
242 for (integer i = 1; i <= my intervals.size; i ++) {
243 if (durations [i] <= 0.0)
244 continue;
245 const TextInterval ti = my intervals.at [i];
246 time += durations [i];
247 if (fabs (time - totalDuration) > precision) {
248 IntervalTier_splitInterval (him.get(), time, ti -> text.get(), hisInterval, precision);
249 hisInterval ++;
250 } else { // last interval
251 const TextInterval histi = his intervals.at [hisInterval];
SpeechSynthesizer_getPhonemeCode(SpeechSynthesizer me)252 TextInterval_setText (histi, ti -> text.get());
253 }
254 }
255 return him;
256 } catch (MelderError) {
257 Melder_throw (me, U": parts not cut.");
258 }
259 }
260
261 autoTextGrid TextGrid_IntervalTier_cutPartsMatchingLabel (TextGrid me, IntervalTier thee, conststring32 label, double precision) {
262 try {
SpeechSynthesizer_getVoiceCode(SpeechSynthesizer me)263 Melder_require (my xmin == thy xmin && my xmax == thy xmax,
264 U"Domains should be equal.");
265 double cutDurations = 0;
266 for (integer i = 1; i <= thy intervals.size; i ++) {
267 const TextInterval cut = thy intervals.at [i];
268 if (Melder_equ (cut -> text.get(), label))
269 cutDurations += cut -> xmax - cut -> xmin;
270 }
271 if (cutDurations <= precision) // Nothing to patch
272 return Data_copy (me);
273 autoTextGrid him = TextGrid_createWithoutTiers (0, thy xmax - thy xmin - cutDurations);
SpeechSynthesizer_create(conststring32 languageName,conststring32 voiceName)274 for (integer itier = 1; itier <= my tiers->size; itier ++) {
275 const Function anyTier = my tiers->at [itier];
276 if (anyTier -> classInfo == classIntervalTier) {
277 autoIntervalTier newTier = IntervalTier_IntervalTier_cutPartsMatchingLabel ((IntervalTier) anyTier, thee, label, precision);
278 his tiers -> addItem_move (newTier.move());
279 } else {
280 autoTextTier newTier = TextTier_IntervalTier_cutPartsMatchingLabel ((TextTier) anyTier, thee, label, precision);
281 his tiers -> addItem_move (newTier.move());
282 }
283 }
284 return him;
285 } catch (MelderError) {
286 Melder_throw (me, U": no parts cut.");
287 }
288 }
289
290 // Patch thy intervals that match patchLabel into my intervals
291 // The resulting IntervalTier has thy xmin as starting time and thy xmax as end time
SpeechSynthesizer_setTextInputSettings(SpeechSynthesizer me,int inputTextFormat,int inputPhonemeCoding)292 autoIntervalTier IntervalTiers_patch_noBoundaries (IntervalTier me, IntervalTier thee, conststring32 patchLabel, double precision) {
293 try {
294 autoVEC durations = zero_VEC (my intervals.size + 1);
295 for (integer i = 1; i <= my intervals.size; i ++) {
296 const TextInterval myti = my intervals.at [i];
297 durations [i] = myti -> xmax - myti -> xmin;
298 }
299 integer myInterval = 1;
300 double xShift = thy xmin - my xmin, firstShift = 0.0;
301 for (integer interval = 1; interval <= thy intervals.size; interval ++) {
302 const TextInterval patch = thy intervals.at [interval];
303 if (Melder_equ (patch -> text.get(), patchLabel)) {
304 if (interval == 1)
305 xShift += firstShift = patch -> xmax - patch -> xmin;
306 else if (interval == thy intervals.size)
307 durations [my intervals.size + 1] = patch -> xmax - patch -> xmin;
308 else
309 while (myInterval <= my intervals.size) {
310 const TextInterval ti = my intervals.at [myInterval];
311 const double tixmin = ti -> xmin + xShift;
312 const double tixmax = ti -> xmax + xShift;
313 if ((patch -> xmin > tixmin - precision) && (patch -> xmin < tixmax + precision)) {
314 durations [myInterval] += patch -> xmax - patch -> xmin;
315 break;
316 }
317 myInterval++;
318 }
319 } else
320 while (myInterval <= my intervals.size) {
321 TextInterval ti = my intervals.at [myInterval];
322 double tixmax = ti -> xmax + xShift;
323 if (tixmax < patch -> xmin + precision)
324 myInterval ++;
325 else
326 break;
327 }
328 }
329 autoIntervalTier him = IntervalTier_create (thy xmin, thy xmax);
330 // first interval
331 double time = thy xmin + firstShift;
332 integer hisInterval = 1;
333 if (firstShift > 0.0) {
334 IntervalTier_splitInterval (him.get(), time , U"", hisInterval, precision);
335 hisInterval ++;
336 }
337 for (integer interval = 1; interval <= my intervals.size; interval ++) {
338 const TextInterval ti = my intervals.at [interval];
339 time += durations [interval];
340 IntervalTier_splitInterval (him.get(), time, ti -> text.get(), hisInterval, precision);
341 hisInterval ++;
342 }
343 if (durations [my intervals.size + 1] > 0.0) {
344 time += durations [my intervals.size + 1];
345 IntervalTier_splitInterval (him.get(), time , U"", hisInterval, precision);
346 }
347 return him;
348 } catch (MelderError) {
349 Melder_throw (me, U": not patched.");
350 }
351 }
352
353 #if 0
/*
	Disabled code (this function sits inside an `#if 0` region and is not compiled).

	An alternative to IntervalTiers_patch_noBoundaries that gives every patch an empty-labeled
	interval of its own, splitting an interval of me in two if the patch starts inside it.
	The only reference to it in the visible part of this file is a commented-out call
	in TextGrid_IntervalTier_patch.

	NOTE(review): as written this would not compile if enabled; see the notes below.
*/
static autoIntervalTier IntervalTiers_patch (IntervalTier me, IntervalTier thee, conststring32 patchLabel, double precision) {
	try {
		// NOTE(review): "utoIntervalTier" looks like a typo for "autoIntervalTier"
		utoIntervalTier him = IntervalTier_create (thy xmin, thy xmax);
		integer myInterval = 1, hisInterval = 1;
		double xmax = thy xmin;   // end time of what has been written to the result so far
		for (integer i = 1; i <= thy intervals.size; i ++) {
			TextInterval myti, ti = thy intervals.at [i];
			if (Melder_equ (ti -> text, patchLabel)) {
				bool splitInterval = false;
				double endtime, split = 0.0;
				// NOTE(review): the stray token "BUG" is not valid code; also, `i > 0` is always true here because i starts at 1
				BUG if (i > 0) {
					// copy those of my intervals that fit before the start of the patch
					while (myInterval <= my intervals.size) {
						myti = my intervals.at [myInterval];
						endtime = xmax + myti -> xmax - myti -> xmin;
						if (endtime <= ti -> xmin + precision) {
							xmax = endtime;
							IntervalTier_splitInterval (him.get(), xmax, myti -> text, hisInterval, precision);
							hisInterval ++;
						} else {
							if (xmax < ti -> xmin - precision) { // split interval ???
								// the patch starts inside this interval: write the first part now, remember the rest
								splitInterval = true;
								xmax = ti -> xmin;
								split = endtime - xmax;
								IntervalTier_splitInterval (him.get(), xmax, myti -> text, hisInterval, precision);
								hisInterval ++;
								myInterval ++;
							}
							break;
						}
						myInterval ++;
					}
				}
				// the patch itself becomes an interval with empty text
				xmax += ti -> xmax - ti -> xmin;
				IntervalTier_splitInterval (him.get(), xmax, U"", hisInterval, precision);
				hisInterval ++;
				if (splitInterval) {
					// write the remaining part of the interval that was split by the patch
					xmax += split;
					IntervalTier_splitInterval (him.get(), xmax, myti -> text, hisInterval, precision);
					hisInterval ++;
				}
			} else if (i == thy intervals.size) { // copy remaining if last interval doesn't match
				while (myInterval <= my intervals.size) {
					myti = my intervals.at [myInterval];
					xmax += myti -> xmax - myti -> xmin;
					IntervalTier_splitInterval (him.get(), xmax, myti -> text, hisInterval, precision);
					hisInterval ++;
					myInterval ++;
				}
			}
		}
		return him;
	} catch (MelderError) {
		Melder_throw (me, U": not patched.");
	}
}
409 #endif
410
411 static autoTextTier TextTier_IntervalTier_patch (TextTier me, IntervalTier thee, conststring32 patchLabel, double precision) {
412 try {
413 integer myIndex = 1;
414 autoTextTier him = TextTier_create (thy xmin, thy xmax);
415 double xShift = thy xmin - my xmin;
416 for (integer i = 1; i <= thy intervals.size; i ++) {
417 const TextInterval ti = thy intervals.at [i];
418 if (Melder_equ (ti -> text.get(), patchLabel)) {
419 if (i > 1) {
420 while (myIndex <= my points.size) {
421 const TextPoint tp = my points.at [myIndex];
422 const double time = tp -> number + xShift;
423 if (time < ti -> xmin + precision) {
424 autoTextPoint newPoint = TextPoint_create (time, tp -> mark.get());
425 his points. addItem_move (newPoint.move());
426 } else {
427 break;
428 }
429 myIndex ++;
430 }
431 }
almost_equal(double t1,double t2)432 xShift += ti -> xmax - ti -> xmin;
433 } else if (i == thy intervals.size) {
434 while (myIndex <= my points.size) {
435 const TextPoint tp = my points.at [myIndex];
436 const double time = tp -> number + xShift;
IntervalTier_insertEmptyIntervalsFromOtherTier(IntervalTier to,IntervalTier from)437 if (time < ti -> xmin + precision) {
438 autoTextPoint newPoint = TextPoint_create (time, tp -> mark.get());
439 his points. addItem_move (newPoint.move());
440 }
441 myIndex ++;
442 }
443 }
444 }
445 return him;
446 } catch (MelderError) {
447 Melder_throw (me, U": cannot patch TextTier.");
448 }
449 }
450
451 autoTextGrid TextGrid_IntervalTier_patch (TextGrid me, IntervalTier thee, conststring32 patchLabel, double precision) {
452 try {
453 double patchDurations;
454 integer numberOfPatches;
455 IntervalTier_getLabelInfo (thee, patchLabel, & patchDurations, & numberOfPatches);
456 if (patchDurations <= 0 || my xmax - my xmin >= thy xmax - thy xmin ) // Nothing to patch
457 return Data_copy (me);
458 autoTextGrid him = TextGrid_createWithoutTiers (thy xmin, thy xmax);
459 for (integer itier = 1; itier <= my tiers->size; itier ++) {
460 const Function anyTier = my tiers->at [itier];
461 if (anyTier -> classInfo == classIntervalTier) {
462 //autoIntervalTier ait = IntervalTiers_patch ((IntervalTier) anyTier, thee, patchLabel, precision);
463 autoIntervalTier newTier = IntervalTiers_patch_noBoundaries ((IntervalTier) anyTier, thee, patchLabel, precision);
IntervalTier_removeVeryShortIntervals(IntervalTier me)464 his tiers -> addItem_move (newTier.move());
465 } else {
466 autoTextTier newTier = TextTier_IntervalTier_patch ((TextTier) anyTier, thee, patchLabel, precision);
467 his tiers -> addItem_move (newTier.move());
468 }
469 }
470 return him;
471 } catch (MelderError) {
472 Melder_throw (me, U": not patched.");
473 }
474 }
Table_to_TextGrid(Table me,conststring32 text,double xmin,double xmax)475
// We assume that the Sound and the SpeechSynthesizer have the same samplingFrequency
/*
	Align the text of a TextInterval with a Sound.

	The text is synthesized, leading and trailing silences are trimmed from both the Sound and
	the synthesized sound, a DTW is computed between the two (trimmed) sounds, and the TextGrid
	that came with the synthesis is warped through that DTW. If silences were trimmed from the
	Sound, the result is extended afterwards so that it covers the whole domain of the Sound again.

	The domains of the Sound and the TextInterval have to be equal, the sampling frequencies of the
	Sound and the synthesizer have to be equal, and the interval has to contain at least one token.

	Side effect: if my d_estimateSpeechRate is set, my d_wordsPerMinute is overwritten
	with an estimate derived from the text and the duration of the trimmed Sound.
*/
autoTextGrid SpeechSynthesizer_Sound_TextInterval_align (SpeechSynthesizer me, Sound thee, TextInterval him, double silenceThreshold, double minSilenceDuration, double minSoundingDuration) {
	try {
		Melder_require (thy xmin == his xmin && thy xmax == his xmax,
			U"Domains of Sound and TextGrid should be equal.");
		Melder_require (fabs (1.0 / thy dx - my d_samplingFrequency) < 1e-9,
			U"The sampling frequencies of the SpeechSynthesizer and the Sound should be equal.");

		// the tokens are only counted; they serve the emptiness check and the speech rate estimate
		autoSTRVEC tokens = splitByWhitespace_STRVEC (his text.get());
		const integer numberOfTokens = tokens.size;
		Melder_require (numberOfTokens > 0,
			U"The interval should contain text.");
		/*
			Remove silent intervals from start and end of sounds because
			1. it will improve the word rate guess
			2. it will improve the DTW matching.
		*/
		// one sampling period of the Sound is used as the time precision throughout
		const double minPitch = 200.0, timeStep = 0.005, precision = thy dx;
		double startTimeOfSounding, endTimeOfSounding;
		autoSound soundTrimmed = Sound_trimSilencesAtStartAndEnd (thee, 0.0, minPitch, timeStep, silenceThreshold, minSilenceDuration, minSoundingDuration, & startTimeOfSounding, & endTimeOfSounding);
		const double duration_soundTrimmed = soundTrimmed -> xmax - soundTrimmed -> xmin;
		// the Sound "has silence" if the sounding part does not start and end at the edges of the Sound
		const bool hasSilence_sound = fabs (startTimeOfSounding - thy xmin) > precision || fabs (endTimeOfSounding - thy xmax) > precision;

		// this has to happen before the synthesis below, which presumably uses my d_wordsPerMinute
		if (my d_estimateSpeechRate) {
			/*
				Estimate speaking rate with the number of words per minute from the text
			*/
			const double wordsPerMinute_rawTokens = 60.0 * numberOfTokens / duration_soundTrimmed;
			/*
				Compensation for long words: 5 characters / word
			*/
			const double wordsPerMinute_rawText = 60.0 * (str32len (his text.get()) / 5.0) / duration_soundTrimmed;
			// the average of the two estimates, rounded down
			my d_wordsPerMinute = Melder_ifloor (0.5 * (wordsPerMinute_rawTokens + wordsPerMinute_rawText));
		}

		autoTextGrid textgrid_synth, textgrid_synth_sounding;
		autoSound synth = SpeechSynthesizer_TextInterval_to_Sound (me, him, & textgrid_synth);
		/*
			For the synthesizer the silence threshold has to be < -30 dB, otherwise fricatives will not
			be found as sounding! This is ok since silences are almost at zero amplitudes for synthesized sounds.
			We also have to decrease the minimum silence and minimum sounding duration to catch, for example,
			the final plosive "t" from the synthesized sound "text".
		*/
		const double silenceThreshold_synth = -40.0, minSilenceDuration_synth = 0.05;
		const double minSoundingDuration_synth = 0.05;
		double startTimeOfSounding_synth, endTimeOfSounding_synth;
		autoSound synthTrimmed = Sound_trimSilencesAtStartAndEnd (synth.get(), 0.0, minPitch, timeStep, silenceThreshold_synth,
			minSilenceDuration_synth, minSoundingDuration_synth, & startTimeOfSounding_synth, & endTimeOfSounding_synth);
		const double synthTrimmed_duration = synthTrimmed -> xmax - synthTrimmed -> xmin;
		const bool hasSilence_synth = fabs (startTimeOfSounding_synth - synth -> xmin) > precision ||
			fabs (endTimeOfSounding_synth - synth -> xmax) > precision;

		// restrict the synthesis TextGrid to the sounding part, so that it matches the trimmed synthesized sound
		if (hasSilence_synth)
			textgrid_synth_sounding = TextGrid_extractPart (textgrid_synth.get(), startTimeOfSounding_synth, endTimeOfSounding_synth, true);
		/*
			Compare the durations of the two sounds to get an indication of the slope constraint needed for the DTW
		*/
		double slope = duration_soundTrimmed / synthTrimmed_duration;
		slope = ( slope > 1.0 ? slope : 1.0 / slope );   // make the ratio >= 1, whichever sound is longer
		// the larger the duration ratio, the lower the constraint number that is passed to the DTW
		const int constraint = ( slope < 1.5 ? 4 : slope < 2.0 ? 3 : slope < 3.0 ? 2 : 1 ); // TODO enums

		const double analysisWidth = 0.02, dt = 0.005, band = 0.0;
		// use the trimmed versions only where something was actually trimmed
		autoDTW dtw = Sounds_to_DTW ((hasSilence_sound ? soundTrimmed.get() : thee),
			(hasSilence_synth ? synthTrimmed.get() : synth.get()), analysisWidth, dt, band, constraint);

		autoTextGrid result = DTW_TextGrid_to_TextGrid (dtw.get(), (hasSilence_synth ? textgrid_synth_sounding.get() : textgrid_synth.get()), precision);
		// put the silences that were trimmed from the Sound back, as intervals with empty labels
		if (hasSilence_sound) {
			if (startTimeOfSounding > thy xmin)
				TextGrid_setEarlierStartTime (result.get(), thy xmin, U"", U"");
			if (endTimeOfSounding < thy xmax || result -> xmax < thy xmax)
				TextGrid_setLaterEndTime (result.get(), thy xmax, U"", U"");
		}
		return result;
	} catch (MelderError) {
		Melder_throw (U"Sound and TextInterval not aligned.");
	}
}
553 /*
554 typedef struct structAlignmentOfSoundAndTextStruct {
555 double windowLength, timeStep; // analysis
556 double f1_mel, fmax_mel, df_mel; // MelFilter
557 integer numberOfMFCCCoefficients; // MFCC
558 double dtw_cepstralWeight, dtw_logEnergyWeight; // MFCC -> DTW
559 double dtw_regressionWeight, dtw_regressionlogEnergyWeight;
560 double dtw_regressionWindowLength;
561 double dtw_sakoeChibaBand, dtw_constraint;
562 double silenceThreshold, minSilenceDuration, minSoundingDuration, trimDuration; // silence detection
563 integer language, voicevariant, pitchAdjustment, pitchRange, wordsPerMinute; // synthesizer
564 bool interpretPhonemeCodes, ipa, set_wordsPerMinute;
565 double wordgap; // synthesizer
566 } *SpeechSynthesizer_alignmentStruct;*/
567
568 static autoTextGrid SpeechSynthesizer_Sound_TextInterval_align2 (SpeechSynthesizer me, Sound thee, TextInterval him, double silenceThreshold, double minSilenceDuration, double minSoundingDuration, double trimDuration) {
569 try {
570 Melder_require (thy xmin == his xmin && thy xmax == his xmax,
571 U"Domains of Sound and TextGrid should be equal.");
572 Melder_require (fabs (1.0 / thy dx - my d_samplingFrequency) < 1e-9,
573 U"The sampling frequencies of the SpeechSynthesizer and the Sound should be equal.");
574
575 const conststring32 trimLabel = U"trim";
576 /*
577 1. Trim the silences of the sound
578
579 For the synthesizer the silence threshold has to be < -30 dB, otherwise fricatives
580 will not be found as sounding! This is ok since silences are almost at zero amplitudes
581 We also have to decrease the minimum silence and minimum sounding duration to catch,
582 for example, the final plosive "t" from the word "text"
583 */
584 const double minPitch = 200, timeStep = 0.005, precision = thy dx;
585 autoTextGrid thee_trimmer;
586 autoSound thee_trimmed = Sound_trimSilences (thee, trimDuration, false, minPitch, timeStep, silenceThreshold, minSilenceDuration, minSoundingDuration, &thee_trimmer, trimLabel);
587 /*
588 2. Synthesize the sound from the TextInterval
589 */
590 autoTextGrid tg_syn;
591 autoSound synth = SpeechSynthesizer_TextInterval_to_Sound (me, him, &tg_syn);
592 /*
593 3. There should be no silences in the synthesized sound except at the start and finish.
594 Set the wordwarp parameter to a small value like 0.001 s.
595
596 4. Get DTW from the two sounds
597 */
598 const double analysisWidth = 0.02, dt = 0.005, band = 0.0;
599 const int constraint = 4;
600 autoDTW dtw = Sounds_to_DTW (thee_trimmed.get(), synth.get(), analysisWidth, dt, band, constraint);
601 /*
602 6. Warp the synthesis TextGrid
603 First make domains equal, otherwsise the warper protests
604 */
SpeechSynthesizer_to_Sound(SpeechSynthesizer me,conststring32 text,autoTextGrid * tg,autoTable * events)605 autoTextGrid warp = DTW_TextGrid_to_TextGrid (dtw.get(), tg_syn.get(), precision);
606 /*
607 6. Patch the trimmed intervals back into the warped TextGrid
608 */
609 autoTextGrid result = TextGrid_IntervalTier_patch (warp.get(), (IntervalTier) thee_trimmer -> tiers->at [1], U"trim", 2 * thy dx);
610
611 return result;
612 } catch (MelderError) {
613 Melder_throw (thee, U": sound and TextInterval not aligned.");
614 }
615 }
616
617 autoTextGrid SpeechSynthesizer_Sound_IntervalTier_align (SpeechSynthesizer me, Sound thee, IntervalTier him, integer istart, integer iend, double silenceThreshold, double minSilenceDuration, double minSoundingDuration) {
618 try {
619 IntervalTier_checkRange (him, istart, iend);
620 const TextInterval tib = his intervals.at [istart];
621 const TextInterval tie = his intervals.at [iend];
622 Melder_require (tib -> xmin >= thy xmin && tie -> xmax <= thy xmax,
623 U"The chosen interval(s) must lie within the sound.");
624 OrderedOf<structTextGrid> textgrids;
625 autoTextGrid result = TextGrid_create (tib -> xmin, tie -> xmax, U"sentence clause word phoneme", U"");
626 for (integer iint = istart; iint <= iend; iint ++) {
627 const TextInterval ti = his intervals.at [iint];
628 if (ti -> text && ti -> text [0] != U'\0') {
629 autoSound sound = Sound_extractPart (thee, ti -> xmin, ti -> xmax, kSound_windowShape::RECTANGULAR, 1, true);
630 autoTextGrid grid = SpeechSynthesizer_Sound_TextInterval_align (me, sound.get(), ti, silenceThreshold, minSilenceDuration, minSoundingDuration);
631 textgrids. addItem_move (grid.move());
632 }
633 }
634 Melder_require (textgrids.size > 0,
635 U"Nothing could be aligned. Was your IntervalTier empty?");
636 autoTextGrid aligned = TextGrids_to_TextGrid_appendContinuous (& textgrids, true);
637 return aligned;
638 } catch (MelderError) {
639 Melder_throw (U"No aligned TextGrid created.");
640 }
641 }
642
643 static autoTextGrid SpeechSynthesizer_Sound_IntervalTier_align2 (SpeechSynthesizer me, Sound thee, IntervalTier him, integer istart, integer iend, double silenceThreshold, double minSilenceDuration, double minSoundingDuration, double trimDuration) {
644 try {
645 IntervalTier_checkRange (him, istart, iend);
646 const TextInterval tb = his intervals.at [istart];
647 const TextInterval te = his intervals.at [iend];
648 autoTextGrid result = TextGrid_create (tb -> xmin, te -> xmax, U"sentence clause word phoneme", U"");
649 OrderedOf<structTextGrid> textgrids;
650 for (integer iint = istart; iint <= iend; iint ++) {
651 const TextInterval ti = his intervals.at [iint];
652 if (ti -> text && ti -> text [0] != U'\0') {
653 autoSound sound = Sound_extractPart (thee, ti -> xmin, ti -> xmax, kSound_windowShape::RECTANGULAR, 1, true);
654 autoTextGrid grid = SpeechSynthesizer_Sound_TextInterval_align2 (me, sound.get(), ti, silenceThreshold, minSilenceDuration, minSoundingDuration, trimDuration);
655 textgrids. addItem_move (grid.move());
656 }
657 }
658 Melder_require (textgrids.size > 0, U"Nothing could be aligned. Was your IntervalTier empty?");
659
660 autoTextGrid aligned = TextGrids_to_TextGrid_appendContinuous (& textgrids, true);
661 return aligned;
662 } catch (MelderError) {
663 Melder_throw (U"No aligned TextGrid created.");
664 }
665 }
666
667 autoTextGrid SpeechSynthesizer_Sound_TextGrid_align (SpeechSynthesizer me, Sound thee, TextGrid him, integer tierNumber, integer istart, integer iend, double silenceThreshold, double minSilenceDuration, double minSoundingDuration) {
668 try {
669 Melder_require (thy xmin == his xmin && thy xmax == his xmax,
670 U"The domains of the Sound and the TextGrid must be equal.");
671 const IntervalTier tier = TextGrid_checkSpecifiedTierIsIntervalTier (him, tierNumber);
672 autoTextGrid grid = SpeechSynthesizer_Sound_IntervalTier_align (me, thee, tier, istart, iend, silenceThreshold, minSilenceDuration, minSoundingDuration);
673 return grid;
674 } catch (MelderError) {
675 Melder_throw (me, U", ", thee, U", ", him, U": Cannot align.");
676 }
677 }
678
679
680 autoTextGrid SpeechSynthesizer_Sound_TextGrid_align2 (SpeechSynthesizer me, Sound thee, TextGrid him, integer tierNumber, integer istart, integer iend, double silenceThreshold, double minSilenceDuration, double minSoundingDuration, double trimDuration) {
681 try {//TODO: check not empty tier
682 const IntervalTier tier = TextGrid_checkSpecifiedTierIsIntervalTier (him, tierNumber);
683 autoTextGrid grid = SpeechSynthesizer_Sound_IntervalTier_align2 (me, thee, tier, istart, iend, silenceThreshold, minSilenceDuration, minSoundingDuration, trimDuration);
684 return grid;
685 } catch (MelderError) {
686 Melder_throw (U"");
687 }
688 }
689
690 static autoStrings IntervalTier_to_Strings_withOriginData (IntervalTier me, INTVEC from) {
691 try {
692 autoStrings thee = Thing_new (Strings);
693 thy strings = autoSTRVEC (my intervals.size);
694 for (integer i = 1; i <= my intervals.size; i ++) {
695 const TextInterval ti = my intervals.at [i];
696 if (ti -> text && ti -> text [0] != U'\0') {
697 thy strings [++ thy numberOfStrings] = Melder_dup (ti -> text.get());
698 from [thy numberOfStrings] = i;
699 }
700 }
701 return thee;
702 } catch (MelderError) {
703 Melder_throw (me, U": no Strings created.");
704 }
705 }
706
707 autoTable IntervalTiers_to_Table_textAlignmentment (IntervalTier target, IntervalTier source, EditCostsTable costs) {
708 try {
709 const integer numberOfTargetIntervals = target -> intervals.size;
710 const integer numberOfSourceIntervals = source -> intervals.size;
711 autoINTVEC targetOrigin = zero_INTVEC (numberOfTargetIntervals);
712 autoINTVEC sourceOrigin = zero_INTVEC (numberOfSourceIntervals);
713 autoStrings targets = IntervalTier_to_Strings_withOriginData (target, targetOrigin.get());
714 autoStrings sources = IntervalTier_to_Strings_withOriginData (source, sourceOrigin.get());
715 autoEditDistanceTable edit = EditDistanceTable_create (targets.get(), sources.get());
716 if (costs) {
717 EditDistanceTable_setEditCosts (edit.get(), costs);
718 EditDistanceTable_findPath (edit.get(), nullptr);
719 }
720 const integer pathLength = edit -> warpingPath -> pathLength;
721 const conststring32 columnNames [] = {
722 U"targetInterval", U"targetText", U"targetStart", U"targetEnd",
723 U"sourceInterval", U"sourceText", U"sourceStart", U"sourceEnd",
724 U"operation"
725 };
726 autoTable thee = Table_createWithColumnNames (pathLength - 1, ARRAY_TO_STRVEC (columnNames));
727 for (integer i = 2; i <= pathLength; i++) {
728 const structPairOfInteger p = edit -> warpingPath -> path [i];
729 const structPairOfInteger p1 = edit -> warpingPath -> path [i - 1];
730 double targetStart = undefined, targetEnd = undefined;
731 double sourceStart = undefined, sourceEnd = undefined;
732 conststring32 targetText = U"", sourceText = U"";
733 const integer targetInterval = ( p.y > 1 ? targetOrigin [p.y - 1] : 0 );
734 const integer sourceInterval = ( p.x > 1 ? sourceOrigin [p.x - 1] : 0 );
735 if (targetInterval > 0) {
736 const TextInterval ti = target -> intervals.at [targetInterval];
737 targetStart = ti -> xmin;
738 targetEnd = ti -> xmax;
739 targetText = ti -> text.get();
740 }
741 if (sourceInterval > 0) {
742 const TextInterval ti = source -> intervals.at [sourceInterval];
743 sourceStart = ti -> xmin;
744 sourceEnd = ti -> xmax;
745 sourceText = ti -> text.get();
746 }
747 const integer irow = i - 1;
748 if (p.y == p1.y) { // deletion
749 Table_setNumericValue (thee.get(), irow, 1, 0);
750 Table_setStringValue (thee.get(), irow, 2, U"");
751 Table_setNumericValue (thee.get(), irow, 3, undefined);
752 Table_setNumericValue (thee.get(), irow, 4, undefined);
753 Table_setNumericValue (thee.get(), irow, 5, sourceInterval);
754 Table_setStringValue (thee.get(), irow, 6, sourceText);
755 Table_setNumericValue (thee.get(), irow, 7, sourceStart);
756 Table_setNumericValue (thee.get(), irow, 8, sourceEnd);
757 Table_setStringValue (thee.get(), irow, 9, U"d");
758 } else if (p.x == p1.x) { // insertion
759 Table_setNumericValue (thee.get(), irow, 1, targetInterval);
760 Table_setStringValue (thee.get(), irow, 2, targetText);
761 Table_setNumericValue (thee.get(), irow, 3, targetStart);
762 Table_setNumericValue (thee.get(), irow, 4, targetEnd);
763 Table_setNumericValue (thee.get(), irow, 5, 0);
764 Table_setStringValue (thee.get(), irow, 6, U"");
765 Table_setNumericValue (thee.get(), irow, 7, undefined);
766 Table_setNumericValue (thee.get(), irow, 8, undefined);
767 Table_setStringValue (thee.get(), irow, 9, U"i");
768 } else { // substitution ?
769 Table_setNumericValue (thee.get(), irow, 1, targetInterval);
770 Table_setStringValue (thee.get(), irow, 2, targetText);
771 Table_setNumericValue (thee.get(), irow, 3, targetStart);
772 Table_setNumericValue (thee.get(), irow, 4, targetEnd);
773 Table_setNumericValue (thee.get(), irow, 5, sourceInterval);
774 Table_setStringValue (thee.get(), irow, 6, sourceText);
775 Table_setNumericValue (thee.get(), irow, 7, sourceStart);
776 Table_setNumericValue (thee.get(), irow, 8, sourceEnd);
777 Table_setStringValue (thee.get(), irow, 9, Melder_equ (targetText, sourceText) ? U" " : U"s");
778 }
779 }
780 return thee;
781 } catch (MelderError) {
782 Melder_throw (target, U" and ", source, U" not aligned.");
783 }
784 }
785
786 autoTable TextGrids_to_Table_textAlignment (TextGrid target, integer ttier, TextGrid source, integer stier, EditCostsTable costs) {
787 try {
788 const IntervalTier targetTier = TextGrid_checkSpecifiedTierIsIntervalTier (target, ttier);
789 const IntervalTier sourceTier = TextGrid_checkSpecifiedTierIsIntervalTier (source, stier);
790 return IntervalTiers_to_Table_textAlignmentment (targetTier, sourceTier, costs);
791 } catch (MelderError) {
792 Melder_throw (U"No text alignment table created from TextGrids ", target, U" and ", source, U".");
793 }
794 }
795
// End of file SpeechSynthesizer_and_TextGrid.cpp
797