1 /*
2 * Copyright (C) 2005 to 2014 by Jonathan Duddington
3 * email: jonsd@users.sourceforge.net
4 * Copyright (C) 2015-2017 Reece H. Dunn
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, see: <http://www.gnu.org/licenses/>.
18 */
19
20 #include "config.h"
21
22 #include <ctype.h>
23 #include <errno.h>
24 #include <math.h>
25 //#include <stdbool.h>
26 #include <stdint.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30
31 #include "espeak_ng.h"
32 #include "encoding.h"
33 #include "speech.h"
34 #include "synthesize.h"
35 #include "translate.h"
36
extern FILE *f_log;
static void SmoothSpect(void); // forward declaration: EndPitch() calls it before its definition

// list of phonemes in a clause
int n_phoneme_list = 0;
PHONEME_LIST phoneme_list[N_PHONEME_LIST+1];

int mbrola_delay;
char mbrola_name[20]; // if not empty, Generate() hands the clause to MbrolaGenerate()

SPEED_FACTORS speed; // factors and minimum values used to scale pause and sample lengths

// State shared by the functions below which add commands to the wavegen queue (wcmdq)
static int last_pitch_cmd; // wcmdq index of the latest WCMD_PITCH command, -1 at the start of a clause
static int last_amp_cmd; // wcmdq index of the latest WCMD_AMPLITUDE command
static frame_t *last_frame; // last frame queued by DoSpect2(), NULL after a pause, sample or zero-length frame
static int last_wcmdq; // wcmdq index of the latest spectrum or wave command, -1 after a voice break
static int pitch_length; // length accumulated by DoSpect2() since the latest pitch command
static int amp_length; // length accumulated by DoSpect2() since the latest amplitude command
static int modn_flags; // set by FormantTransition2() next to a glottal stop, merged into the modulation by DoSpect2()
static int fmt_amplitude = 0; // value of the latest WCMD_FMT_AMPLITUDE command, 0 = none in effect

// wcmdq indices of the syllable which SmoothSpect() works on
static int syllable_start;
static int syllable_end;
static int syllable_centre; // -1 if no vowel centre has been recorded

static voice_t *new_voice = NULL;

int n_soundicon_tab = N_SOUNDICON_SLOTS;
SOUND_ICON soundicon_tab[N_SOUNDICON_TAB];

#define RMS_GLOTTAL1 35 // vowel before glottal stop
#define RMS_START 28 // 28

#define VOWEL_FRONT_LENGTH 50 // default length given to the first frame on entry to a vowel

// a dummy phoneme_list entry which looks like a pause
static PHONEME_LIST next_pause;
74
const char *WordToString(unsigned int word)
{
	// Unpack a phoneme mnemonic, held in 'word' with its first character in
	// the least significant byte, into a NUL-terminated string of up to 4
	// characters. The result is in a static buffer which the next call
	// overwrites.
	static char text[5];
	char *out = text;
	unsigned int remaining = word;

	while (out < &text[4]) {
		*out++ = (char)remaining;
		remaining >>= 8;
	}
	*out = 0;
	return text;
}
86
SynthesizeInit()87 void SynthesizeInit()
88 {
89 last_pitch_cmd = 0;
90 last_amp_cmd = 0;
91 last_frame = NULL;
92 syllable_centre = -1;
93
94 // initialise next_pause, a dummy phoneme_list entry
95 next_pause.type = phPAUSE;
96 next_pause.newword = 0;
97 }
98
EndAmplitude(void)99 static void EndAmplitude(void)
100 {
101 if (amp_length > 0) {
102 if (wcmdq[last_amp_cmd][1] == 0)
103 wcmdq[last_amp_cmd][1] = amp_length;
104 amp_length = 0;
105 }
106 }
107
EndPitch(int voice_break)108 static void EndPitch(int voice_break)
109 {
110 // posssible end of pitch envelope, fill in the length
111 if ((pitch_length > 0) && (last_pitch_cmd >= 0)) {
112 if (wcmdq[last_pitch_cmd][1] == 0)
113 wcmdq[last_pitch_cmd][1] = pitch_length;
114 pitch_length = 0;
115 }
116
117 if (voice_break) {
118 last_wcmdq = -1;
119 last_frame = NULL;
120 syllable_end = wcmdq_tail;
121 SmoothSpect();
122 syllable_centre = -1;
123 memset(vowel_transition, 0, sizeof(vowel_transition));
124 }
125 }
126
DoAmplitude(int amp,unsigned char * amp_env)127 static void DoAmplitude(int amp, unsigned char *amp_env)
128 {
129 intptr_t *q;
130
131 last_amp_cmd = wcmdq_tail;
132 amp_length = 0; // total length of vowel with this amplitude envelope
133
134 q = wcmdq[wcmdq_tail];
135 q[0] = WCMD_AMPLITUDE;
136 q[1] = 0; // fill in later from amp_length
137 q[2] = (intptr_t)amp_env;
138 q[3] = amp;
139 WcmdqInc();
140 }
141
DoPitch(unsigned char * env,int pitch1,int pitch2)142 static void DoPitch(unsigned char *env, int pitch1, int pitch2)
143 {
144 intptr_t *q;
145
146 EndPitch(0);
147
148 if (pitch1 == 255) {
149 // pitch was not set
150 pitch1 = 55;
151 pitch2 = 76;
152 env = envelope_data[PITCHfall];
153 }
154 last_pitch_cmd = wcmdq_tail;
155 pitch_length = 0; // total length of spect with this pitch envelope
156
157 if (pitch2 < 0)
158 pitch2 = 0;
159
160 q = wcmdq[wcmdq_tail];
161 q[0] = WCMD_PITCH;
162 q[1] = 0; // length, fill in later from pitch_length
163 q[2] = (intptr_t)env;
164 q[3] = (pitch1 << 16) + pitch2;
165 WcmdqInc();
166 }
167
PauseLength(int pause,int control)168 int PauseLength(int pause, int control)
169 {
170 unsigned int len;
171
172 if (control == 0) {
173 if (pause >= 200)
174 len = (pause * speed.clause_pause_factor)/256;
175 else
176 len = (pause * speed.pause_factor)/256;
177 } else
178 len = (pause * speed.wav_factor)/256;
179
180 if (len < speed.min_pause)
181 len = speed.min_pause; // mS, limit the amount to which pauses can be shortened
182 return len;
183 }
184
DoPause(int length,int control)185 static void DoPause(int length, int control)
186 {
187 // length in nominal mS
188 // control = 1, less shortening at fast speeds
189
190 unsigned int len;
191 int srate2;
192
193 if (length == 0)
194 len = 0;
195 else {
196 len = PauseLength(length, control);
197
198 if (len < 90000)
199 len = (len * samplerate) / 1000; // convert from mS to number of samples
200 else {
201 srate2 = samplerate / 25; // avoid overflow
202 len = (len * srate2) / 40;
203 }
204 }
205
206 EndPitch(1);
207 wcmdq[wcmdq_tail][0] = WCMD_PAUSE;
208 wcmdq[wcmdq_tail][1] = len;
209 WcmdqInc();
210 last_frame = NULL;
211
212 if (fmt_amplitude != 0) {
213 wcmdq[wcmdq_tail][0] = WCMD_FMT_AMPLITUDE;
214 wcmdq[wcmdq_tail][1] = fmt_amplitude = 0;
215 WcmdqInc();
216 }
217 }
218
219 extern int seq_len_adjust; // temporary fix to advance the start point for playing the wav sample
220
static int DoSample2(int index, int which, int std_length, int control, int length_mod, int amp)
{
	// Queue the wavegen command(s) which play a stored wave sample.
	//   index:      offset of the sample in wavefile_data, only the low 23 bits are used
	//   which:      if bit 0x100 is set the sample is mixed with the synthesised wave
	//   std_length: nominal length in mS, or <= 0 to use the length of the stored sound
	//   control:    only pd_DONTLENGTHEN is examined here
	//   length_mod: length multiplier in 256ths, ignored unless > 0
	//   amp:        amplitude, a negative value means just calculate the length
	// Returns 0 if the stored sample has zero length. Returns the calculated
	// length in samples if amp < 0 or the sample is mixed (WCMD_WAVE2).
	// Otherwise returns what is left in 'length' after the queued pieces
	// have been subtracted.
	int length;
	int wav_length;
	int wav_scale;
	int min_length;
	int x;
	int len4;
	intptr_t *q;
	unsigned char *p;

	// sample header: bytes 0,1 = length (low byte first), byte 2 = scale,
	// the sample data starts 4 bytes after 'index'
	index = index & 0x7fffff;
	p = &wavefile_data[index];
	wav_scale = p[2];
	wav_length = (p[1] * 256);
	wav_length += p[0]; // length in bytes

	if (wav_length == 0)
		return 0;

	min_length = speed.min_sample_len;

	if (wav_scale == 0)
		min_length *= 2; // 16 bit samples

	if (std_length > 0) {
		// convert from mS to samples, then to bytes for 16 bit data
		std_length = (std_length * samplerate)/1000;
		if (wav_scale == 0)
			std_length *= 2;

		// scale the minimum length by the ratio of requested to stored length
		x = (min_length * std_length)/wav_length;
		if (x > min_length)
			min_length = x;
	} else {
		// no length specified, use the length of the stored sound
		std_length = wav_length;
	}

	if (length_mod > 0)
		std_length = (std_length * length_mod)/256;

	length = (std_length * speed.wav_factor)/256;

	if (control & pd_DONTLENGTHEN) {
		// this option is used for Stops, with short noise bursts.
		// Don't change their length much.
		if (length > std_length) {
			// don't let length exceed std_length
			length = std_length;
		}
	}

	if (length < min_length)
		length = min_length;


	if (wav_scale == 0) {
		// 16 bit samples
		// from here on the lengths are in samples rather than bytes
		length /= 2;
		wav_length /= 2;
	}

	if (amp < 0)
		return length;

	len4 = wav_length / 4;

	index += 4; // skip the header

	if (which & 0x100) {
		// mix this with synthesised wave
		last_wcmdq = wcmdq_tail;
		q = wcmdq[wcmdq_tail];
		q[0] = WCMD_WAVE2;
		q[1] = length | (wav_length << 16); // length in samples
		q[2] = (intptr_t)(&wavefile_data[index]);
		q[3] = wav_scale + (amp << 8);
		WcmdqInc();
		return length;
	}

	// If the requested length is longer than the stored sample: play the first
	// three quarters, then repeat the middle half (from 1/4 to 3/4) as often as
	// needed, then finish with the end of the sample.
	if (length > wav_length) {
		x = len4*3;
		length -= x;
	} else {
		x = length;
		length = 0;
	}

	last_wcmdq = wcmdq_tail;
	q = wcmdq[wcmdq_tail];
	q[0] = WCMD_WAVE;
	q[1] = x; // length in samples
	q[2] = (intptr_t)(&wavefile_data[index]);
	q[3] = wav_scale + (amp << 8);
	WcmdqInc();

	// NOTE(review): if wav_length is less than 4 samples then len4 is 0 and this
	// loop makes no progress while length > 0. Presumably stored samples are
	// always longer than that -- confirm.
	while (length > len4*3) {
		// byte offset of the 1/4 point of the sample
		x = len4;
		if (wav_scale == 0)
			x *= 2;

		last_wcmdq = wcmdq_tail;
		q = wcmdq[wcmdq_tail];
		q[0] = WCMD_WAVE;
		q[1] = len4*2; // length in samples
		q[2] = (intptr_t)(&wavefile_data[index+x]);
		q[3] = wav_scale + (amp << 8);
		WcmdqInc();

		length -= len4*2;
	}

	if (length > 0) {
		// play the last 'length' samples; x is their byte offset
		x = wav_length - length;
		if (wav_scale == 0)
			x *= 2;
		last_wcmdq = wcmdq_tail;
		q = wcmdq[wcmdq_tail];
		q[0] = WCMD_WAVE;
		q[1] = length; // length in samples
		q[2] = (intptr_t)(&wavefile_data[index+x]);
		q[3] = wav_scale + (amp << 8);
		WcmdqInc();
	}

	return length;
}
349
DoSample3(PHONEME_DATA * phdata,int length_mod,int amp)350 int DoSample3(PHONEME_DATA *phdata, int length_mod, int amp)
351 {
352 int amp2;
353 int len;
354 EndPitch(1);
355
356 if (amp == -1) {
357 // just get the length, don't produce sound
358 amp2 = amp;
359 } else {
360 amp2 = phdata->sound_param[pd_WAV];
361 if (amp2 == 0)
362 amp2 = 100;
363 amp2 = (amp2 * 32)/100;
364 }
365
366 seq_len_adjust = 0;
367
368 if (phdata->sound_addr[pd_WAV] == 0)
369 len = 0;
370 else
371 len = DoSample2(phdata->sound_addr[pd_WAV], 2, phdata->pd_param[pd_LENGTHMOD]*2, phdata->pd_control, length_mod, amp2);
372 last_frame = NULL;
373 return len;
374 }
375
AllocFrame()376 static frame_t *AllocFrame()
377 {
378 // Allocate a temporary spectrum frame for the wavegen queue. Use a pool which is big
379 // enough to use a round-robin without checks.
380 // Only needed for modifying spectra for blending to consonants
381
382 #define N_FRAME_POOL N_WCMDQ
383 static int ix = 0;
384 static frame_t frame_pool[N_FRAME_POOL];
385
386 ix++;
387 if (ix >= N_FRAME_POOL)
388 ix = 0;
389 return &frame_pool[ix];
390 }
391
set_frame_rms(frame_t * fr,int new_rms)392 static void set_frame_rms(frame_t *fr, int new_rms)
393 {
394 // Each frame includes its RMS amplitude value, so to set a new
395 // RMS just adjust the formant amplitudes by the appropriate ratio
396
397 int x;
398 int h;
399 int ix;
400
401 static const short sqrt_tab[200] = {
402 0, 64, 90, 110, 128, 143, 156, 169, 181, 192, 202, 212, 221, 230, 239, 247,
403 256, 263, 271, 278, 286, 293, 300, 306, 313, 320, 326, 332, 338, 344, 350, 356,
404 362, 367, 373, 378, 384, 389, 394, 399, 404, 409, 414, 419, 424, 429, 434, 438,
405 443, 448, 452, 457, 461, 465, 470, 474, 478, 483, 487, 491, 495, 499, 503, 507,
406 512, 515, 519, 523, 527, 531, 535, 539, 543, 546, 550, 554, 557, 561, 565, 568,
407 572, 576, 579, 583, 586, 590, 593, 596, 600, 603, 607, 610, 613, 617, 620, 623,
408 627, 630, 633, 636, 640, 643, 646, 649, 652, 655, 658, 662, 665, 668, 671, 674,
409 677, 680, 683, 686, 689, 692, 695, 698, 701, 704, 706, 709, 712, 715, 718, 721,
410 724, 726, 729, 732, 735, 738, 740, 743, 746, 749, 751, 754, 757, 759, 762, 765,
411 768, 770, 773, 775, 778, 781, 783, 786, 789, 791, 794, 796, 799, 801, 804, 807,
412 809, 812, 814, 817, 819, 822, 824, 827, 829, 832, 834, 836, 839, 841, 844, 846,
413 849, 851, 853, 856, 858, 861, 863, 865, 868, 870, 872, 875, 877, 879, 882, 884,
414 886, 889, 891, 893, 896, 898, 900, 902
415 };
416
417 if (voice->klattv[0]) {
418 if (new_rms == -1)
419 fr->klattp[KLATT_AV] = 50;
420 return;
421 }
422
423 if (fr->rms == 0) return; // check for divide by zero
424 x = (new_rms * 64)/fr->rms;
425 if (x >= 200) x = 199;
426
427 x = sqrt_tab[x]; // sqrt(new_rms/fr->rms)*0x200;
428
429 for (ix = 0; ix < 8; ix++) {
430 h = fr->fheight[ix] * x;
431 fr->fheight[ix] = h/0x200;
432 }
433 }
434
formants_reduce_hf(frame_t * fr,int level)435 static void formants_reduce_hf(frame_t *fr, int level)
436 {
437 // change height of peaks 2 to 8, percentage
438 int ix;
439 int x;
440
441 if (voice->klattv[0])
442 return;
443
444 for (ix = 2; ix < 8; ix++) {
445 x = fr->fheight[ix] * level;
446 fr->fheight[ix] = x/100;
447 }
448 }
449
CopyFrame(frame_t * frame1,int copy)450 static frame_t *CopyFrame(frame_t *frame1, int copy)
451 {
452 // create a copy of the specified frame in temporary buffer
453
454 frame_t *frame2;
455
456 if ((copy == 0) && (frame1->frflags & FRFLAG_COPIED)) {
457 // this frame has already been copied in temporary rw memory
458 return frame1;
459 }
460
461 frame2 = AllocFrame();
462 if (frame2 != NULL) {
463 memcpy(frame2, frame1, sizeof(frame_t));
464 frame2->length = 0;
465 frame2->frflags |= FRFLAG_COPIED;
466 }
467 return frame2;
468 }
469
DuplicateLastFrame(frameref_t * seq,int n_frames,int length)470 static frame_t *DuplicateLastFrame(frameref_t *seq, int n_frames, int length)
471 {
472 frame_t *fr;
473
474 seq[n_frames-1].length = length;
475 fr = CopyFrame(seq[n_frames-1].frame, 1);
476 seq[n_frames].frame = fr;
477 seq[n_frames].length = 0;
478 return fr;
479 }
480
AdjustFormants(frame_t * fr,int target,int min,int max,int f1_adj,int f3_adj,int hf_reduce,int flags)481 static void AdjustFormants(frame_t *fr, int target, int min, int max, int f1_adj, int f3_adj, int hf_reduce, int flags)
482 {
483 int x;
484
485 target = (target * voice->formant_factor)/256;
486
487 x = (target - fr->ffreq[2]) / 2;
488 if (x > max) x = max;
489 if (x < min) x = min;
490 fr->ffreq[2] += x;
491 fr->ffreq[3] += f3_adj;
492
493 if (flags & 0x20)
494 f3_adj = -f3_adj; // reverse direction for f4,f5 change
495 fr->ffreq[4] += f3_adj;
496 fr->ffreq[5] += f3_adj;
497
498 if (f1_adj == 1) {
499 x = (235 - fr->ffreq[1]);
500 if (x < -100) x = -100;
501 if (x > -60) x = -60;
502 fr->ffreq[1] += x;
503 }
504 if (f1_adj == 2) {
505 x = (235 - fr->ffreq[1]);
506 if (x < -300) x = -300;
507 if (x > -150) x = -150;
508 fr->ffreq[1] += x;
509 fr->ffreq[0] += x;
510 }
511 if (f1_adj == 3) {
512 x = (100 - fr->ffreq[1]);
513 if (x < -400) x = -400;
514 if (x > -300) x = -400;
515 fr->ffreq[1] += x;
516 fr->ffreq[0] += x;
517 }
518 formants_reduce_hf(fr, hf_reduce);
519 }
520
VowelCloseness(frame_t * fr)521 static int VowelCloseness(frame_t *fr)
522 {
523 // return a value 0-3 depending on the vowel's f1
524 int f1;
525
526 if ((f1 = fr->ffreq[1]) < 300)
527 return 3;
528 if (f1 < 400)
529 return 2;
530 if (f1 < 500)
531 return 1;
532 return 0;
533 }
534
int FormantTransition2(frameref_t *seq, int *n_frames, unsigned int data1, unsigned int data2, PHONEME_TAB *other_ph, int which)
{
	// Adjust the first frame (entry to a vowel) or the last frame (exit from a
	// vowel) of a frame sequence to make the transition to an adjacent phoneme.
	//   seq, n_frames: the frame sequence; *n_frames is incremented when a frame is appended
	//   data1:    bits 0-5 length/2, bits 6-11 rms, bits 12 and above flags
	//   data2:    packed formant targets and adjustments, unpacked below
	//   other_ph: the adjacent phoneme, may be NULL; the mnemonic '?' sets flag 8
	//   which:    1 = entry to vowel, otherwise exit from vowel
	// Does nothing and returns 0 if the sequence has fewer than 2 frames.
	// Returns the transition length if flag 16 is set, otherwise 0.
	int ix;
	int formant;
	int next_rms;

	int len;
	int rms;
	int f1;
	int f2;
	int f2_min;
	int f2_max;
	int f3_adj;
	int f3_amp;
	int flags;
	int vcolour;

	#define N_VCOLOUR 2
	// percentage change for each formant in 256ths
	static short vcolouring[N_VCOLOUR][5] = {
		{ 243, 272, 256, 256, 256 }, // palatal consonant follows
		{ 256, 256, 240, 240, 240 }, // retroflex
	};

	frame_t *fr = NULL; // the frame which was modified, if any

	if (*n_frames < 2)
		return 0;

	len = (data1 & 0x3f) * 2;
	rms = (data1 >> 6) & 0x3f;
	flags = (data1 >> 12);

	// the 5 bit fields f2_min, f2_max and f3_adj are stored with an offset of 15
	f2 = (data2 & 0x3f) * 50;
	f2_min = (((data2 >> 6) & 0x1f) - 15) * 50;
	f2_max = (((data2 >> 11) & 0x1f) - 15) * 50;
	f3_adj = (((data2 >> 16) & 0x1f) - 15) * 50;
	f3_amp = ((data2 >> 21) & 0x1f) * 8;
	f1 = ((data2 >> 26) & 0x7);
	vcolour = (data2 >> 29);

	// flag 8 selects the glottal stop handling
	if ((other_ph != NULL) && (other_ph->mnemonic == '?'))
		flags |= 8;

	if (which == 1) {
		// entry to vowel
		fr = CopyFrame(seq[0].frame, 0);
		seq[0].frame = fr;
		seq[0].length = VOWEL_FRONT_LENGTH;
		if (len > 0)
			seq[0].length = len;
		seq[0].frflags |= FRFLAG_LEN_MOD2; // reduce length modification
		fr->frflags |= FRFLAG_LEN_MOD2;

		next_rms = seq[1].frame->rms;

		if (voice->klattv[0])
			fr->klattp[KLATT_AV] = seq[1].frame->klattp[KLATT_AV] - 4;
		if (f2 != 0) {
			// rms bit 0x20: the low 5 bits are a fraction (in 30ths) of the next
			// frame's rms, applied before the formants are adjusted.
			// Otherwise rms*2 is an absolute value, applied afterwards.
			if (rms & 0x20)
				set_frame_rms(fr, (next_rms * (rms & 0x1f))/30);
			AdjustFormants(fr, f2, f2_min, f2_max, f1, f3_adj, f3_amp, flags);

			if ((rms & 0x20) == 0)
				set_frame_rms(fr, rms*2);
		} else {
			if (flags & 8)
				set_frame_rms(fr, (next_rms*24)/32);
			else
				set_frame_rms(fr, RMS_START);
		}

		if (flags & 8)
			modn_flags = 0x800 + (VowelCloseness(fr) << 8);
	} else {
		// exit from vowel
		rms = rms*2;
		if ((f2 != 0) || (flags != 0)) {

			if (flags & 8) {
				// glottal stop: modify the existing last frame
				fr = CopyFrame(seq[*n_frames-1].frame, 0);
				seq[*n_frames-1].frame = fr;
				rms = RMS_GLOTTAL1;

				// degree of glottal-stop effect depends on closeness of vowel (indicated by f1 freq)
				modn_flags = 0x400 + (VowelCloseness(fr) << 8);
			} else {
				// append a copy of the last frame and modify that
				fr = DuplicateLastFrame(seq, (*n_frames)++, len);
				if (len > 36)
					seq_len_adjust += (len - 36);

				if (f2 != 0)
					AdjustFormants(fr, f2, f2_min, f2_max, f1, f3_adj, f3_amp, flags);
			}

			set_frame_rms(fr, rms);

			if ((vcolour > 0) && (vcolour <= N_VCOLOUR)) {
				// vowel colouring: scale formants 1 to 5 of every frame in the sequence
				for (ix = 0; ix < *n_frames; ix++) {
					fr = CopyFrame(seq[ix].frame, 0);
					seq[ix].frame = fr;

					for (formant = 1; formant <= 5; formant++) {
						int x;
						x = fr->ffreq[formant] * vcolouring[vcolour-1][formant-1];
						fr->ffreq[formant] = x / 256;
					}
				}
			}
		}
	}

	if (fr != NULL) {
		if (flags & 4)
			fr->frflags |= FRFLAG_FORMANT_RATE;
		if (flags & 2)
			fr->frflags |= FRFLAG_BREAK; // don't merge with next frame
	}

	if (flags & 0x40)
		DoPause(20, 0); // add a short pause after the consonant

	if (flags & 16)
		return len;
	return 0;
}
661
static void SmoothSpect(void)
{
	// Limit the rate of frequency change of formants, to reduce chirping
	//
	// Works on the queue entries of the current syllable, starting at
	// syllable_centre: first backwards towards syllable_start, then forwards
	// towards syllable_end. Either pass stops early at a pause or wave command,
	// at a frame which doesn't follow on from the previous one, or at a frame
	// flagged FRFLAG_BREAK. A frame which needs changing is replaced in the
	// queue by a copy from CopyFrame(). On return syllable_start is moved up
	// to syllable_end.

	intptr_t *q;
	frame_t *frame;  // the original (unmodified) frame at the far end of the current command
	frame_t *frame2; // the frame to use in place of 'frame', a modified copy if it was changed
	frame_t *frame1; // the frame at the near end (towards the centre) of the current command
	frame_t *frame_centre;
	int ix;
	int len;
	int pk;
	int modified;
	int allowed;
	int diff;

	if (syllable_start == syllable_end)
		return;

	if ((syllable_centre < 0) || (syllable_centre == syllable_start)) {
		// no vowel centre recorded, or nothing before it: just close the syllable
		syllable_start = syllable_end;
		return;
	}

	q = wcmdq[syllable_centre];
	frame_centre = (frame_t *)q[2];

	// backwards
	ix = syllable_centre -1;
	frame = frame2 = frame_centre;
	for (;;) {
		if (ix < 0) ix = N_WCMDQ-1; // the queue is circular
		q = wcmdq[ix];

		if (q[0] == WCMD_PAUSE || q[0] == WCMD_WAVE)
			break;

		// NOTE(review): presumably the spectrum/Klatt command codes are the
		// values up to WCMD_SPECT2 -- confirm against the WCMD_ definitions.
		if (q[0] <= WCMD_SPECT2) {
			len = q[1] & 0xffff;

			// q[2] is the start frame and q[3] the end frame of this command.
			// The end frame must be the start frame of the command processed
			// before this one; point it at the (possibly modified) replacement.
			frame1 = (frame_t *)q[3];
			if (frame1 == frame) {
				q[3] = (intptr_t)frame2;
				frame1 = frame2;
			} else
				break; // doesn't follow on from previous frame

			frame = frame2 = (frame_t *)q[2];
			modified = 0;

			if (frame->frflags & FRFLAG_BREAK)
				break;

			if (frame->frflags & FRFLAG_FORMANT_RATE)
				len = (len * 12)/10; // allow slightly greater rate of change for this frame (was 12/10)

			for (pk = 0; pk < 6; pk++) {
				int f1, f2;

				if ((frame->frflags & FRFLAG_BREAK_LF) && (pk < 3))
					continue;

				f1 = frame1->ffreq[pk];
				f2 = frame->ffreq[pk];

				// backwards
				if ((diff = f2 - f1) > 0)
					allowed = f1*2 + f2;
				else
					allowed = f1 + f2*2;

				// the allowed change is specified as percentage (%*10) of the frequency
				// take "frequency" as 1/3 from the lower freq
				allowed = (allowed * formant_rate[pk])/3000;
				allowed = (allowed * len)/256;

				if (diff > allowed) {
					if (modified == 0) {
						frame2 = CopyFrame(frame, 0);
						modified = 1;
					}
					frame2->ffreq[pk] = frame1->ffreq[pk] + allowed;
					q[2] = (intptr_t)frame2;
				} else if (diff < -allowed) {
					if (modified == 0) {
						frame2 = CopyFrame(frame, 0);
						modified = 1;
					}
					frame2->ffreq[pk] = frame1->ffreq[pk] - allowed;
					q[2] = (intptr_t)frame2;
				}
			}
		}

		if (ix == syllable_start)
			break;
		ix--;
	}

	// forwards
	ix = syllable_centre;

	frame = NULL; // no previous command yet, so the first one is not checked for continuity
	for (;;) {
		q = wcmdq[ix];

		if (q[0] == WCMD_PAUSE || q[0] == WCMD_WAVE)
			break;

		if (q[0] <= WCMD_SPECT2) {
			len = q[1] & 0xffff;

			// here the start frame (q[2]) must be the end frame of the previous command
			frame1 = (frame_t *)q[2];
			if (frame != NULL) {
				if (frame1 == frame) {
					q[2] = (intptr_t)frame2;
					frame1 = frame2;
				} else
					break; // doesn't follow on from previous frame
			}

			frame = frame2 = (frame_t *)q[3];
			modified = 0;

			if (frame1->frflags & FRFLAG_BREAK)
				break;

			if (frame1->frflags & FRFLAG_FORMANT_RATE)
				len = (len *6)/5; // allow slightly greater rate of change for this frame

			for (pk = 0; pk < 6; pk++) {
				int f1, f2;
				f1 = frame1->ffreq[pk];
				f2 = frame->ffreq[pk];

				// forwards
				if ((diff = f2 - f1) > 0)
					allowed = f1*2 + f2;
				else
					allowed = f1 + f2*2;
				allowed = (allowed * formant_rate[pk])/3000;
				allowed = (allowed * len)/256;

				if (diff > allowed) {
					if (modified == 0) {
						frame2 = CopyFrame(frame, 0);
						modified = 1;
					}
					frame2->ffreq[pk] = frame1->ffreq[pk] + allowed;
					q[3] = (intptr_t)frame2;
				} else if (diff < -allowed) {
					if (modified == 0) {
						frame2 = CopyFrame(frame, 0);
						modified = 1;
					}
					frame2->ffreq[pk] = frame1->ffreq[pk] - allowed;
					q[3] = (intptr_t)frame2;
				}
			}
		}

		ix++;
		if (ix >= N_WCMDQ) ix = 0; // the queue is circular
		if (ix == syllable_end)
			break;
	}

	syllable_start = syllable_end;
}
831
StartSyllable(void)832 static void StartSyllable(void)
833 {
834 // start of syllable, if not already started
835 if (syllable_end == syllable_start)
836 syllable_end = wcmdq_tail;
837 }
838
int DoSpect2(PHONEME_TAB *this_ph, int which, FMT_PARAMS *fmt_params, PHONEME_LIST *plist, int modulation)
{
	// Queue the spectrum commands for a phoneme's frame sequence.
	// which: 0 not a vowel, 1 start of vowel, 2 body and end of vowel
	// fmt_params: selects the frame sequence (fmt_addr) and an optional wave
	//             file to mix with it (wav_addr, which is cleared once queued)
	// plist: this phoneme's entry in the phoneme list; plist->length is the
	//        length modifier, 256 = 100%, 0 is treated as 256
	// modulation: -1 = don't write to wcmdq
	// Returns the total length of the non-empty frames, or 0 if there is no
	// frame sequence.
	// NOTE(review): plist[-1] is read when which == 1, so plist must not be the
	// first entry of its array -- confirm against callers.

	int n_frames;
	frameref_t *frames;
	int frameix;
	frame_t *frame1;
	frame_t *frame2;
	frame_t *fr;
	int ix;
	intptr_t *q;
	int len;
	int frame_length;
	int length_factor;
	int length_mod;
	int length_sum;
	int length_min;
	int total_len = 0;
	static int wave_flag = 0; // set while a mixed wave file started by an earlier sequence may be playing
	int wcmd_spect = WCMD_SPECT;
	int frame_lengths[N_SEQ_FRAMES];

	if (fmt_params->fmt_addr == 0)
		return 0;

	length_mod = plist->length;
	if (length_mod == 0) length_mod = 256;

	length_min = (samplerate/70); // greater than one cycle at low pitch (Hz)
	if (which == 2) {
		if ((translator->langopts.param[LOPT_LONG_VOWEL_THRESHOLD] > 0) && ((this_ph->std_length >= translator->langopts.param[LOPT_LONG_VOWEL_THRESHOLD]) || (plist->synthflags & SFLAG_LENGTHEN) || (this_ph->phflags & phLONG)))
			length_min *= 2; // ensure long vowels are longer
	}

	if (which == 1) {
		// limit the shortening of sonorants before shortened (eg. unstressed vowels)
		if ((this_ph->type == phLIQUID) || (plist[-1].type == phLIQUID) || (plist[-1].type == phNASAL)) {
			if (length_mod < (len = translator->langopts.param[LOPT_SONORANT_MIN]))
				length_mod = len;
		}
	}

	modn_flags = 0; // may be set by FormantTransition2(), see its use below
	frames = LookupSpect(this_ph, which, fmt_params, &n_frames, plist);
	if (frames == NULL)
		return 0; // not found

	if (fmt_params->fmt_amp != fmt_amplitude) {
		// an amplitude adjustment is specified for this sequence
		q = wcmdq[wcmdq_tail];
		q[0] = WCMD_FMT_AMPLITUDE;
		q[1] = fmt_amplitude = fmt_params->fmt_amp;
		WcmdqInc();
	}

	frame1 = frames[0].frame;
	if (voice->klattv[0])
		wcmd_spect = WCMD_KLATT;

	wavefile_ix = fmt_params->wav_addr;

	// wav_amp is a percentage, 0 means 100%
	if (fmt_params->wav_amp == 0)
		wavefile_amp = 32;
	else
		wavefile_amp = (fmt_params->wav_amp * 32)/100;

	if (wavefile_ix == 0) {
		if (wave_flag) {
			// cancel any wavefile that was playing previously
			wcmd_spect = WCMD_SPECT2;
			if (voice->klattv[0])
				wcmd_spect = WCMD_KLATT2;
			wave_flag = 0;
		} else {
			wcmd_spect = WCMD_SPECT;
			if (voice->klattv[0])
				wcmd_spect = WCMD_KLATT;
		}
	}

	if (last_frame != NULL) {
		if (((last_frame->length < 2) || (last_frame->frflags & FRFLAG_VOWEL_CENTRE))
		    && !(last_frame->frflags & FRFLAG_BREAK)) {
			// last frame of previous sequence was zero-length, replace with first of this sequence
			wcmdq[last_wcmdq][3] = (intptr_t)frame1;

			if (last_frame->frflags & FRFLAG_BREAK_LF) {
				// but flag indicates keep HF peaks in last segment
				fr = CopyFrame(frame1, 1);
				for (ix = 3; ix < 8; ix++) {
					// frequencies are copied for peaks 3 to 6 only, heights for peaks 3 to 7
					if (ix < 7)
						fr->ffreq[ix] = last_frame->ffreq[ix];
					fr->fheight[ix] = last_frame->fheight[ix];
				}
				wcmdq[last_wcmdq][3] = (intptr_t)fr;
			}
		}
	}

	if ((this_ph->type == phVOWEL) && (which == 2)) {
		SmoothSpect(); // process previous syllable

		// remember the point in the output queue of the centre of the vowel
		syllable_centre = wcmdq_tail;
	}

	// First pass: calculate the length of each frame transition.
	// frame_lengths[n] is the length from frame n-1 to frame n.
	length_sum = 0;
	for (frameix = 1; frameix < n_frames; frameix++) {
		length_factor = length_mod;
		if (frames[frameix-1].frflags & FRFLAG_LEN_MOD) // reduce effect of length mod
			length_factor = (length_mod*(256-speed.lenmod_factor) + 256*speed.lenmod_factor)/256;
		else if (frames[frameix-1].frflags & FRFLAG_LEN_MOD2) // reduce effect of length mod, used for the start of a vowel
			length_factor = (length_mod*(256-speed.lenmod2_factor) + 256*speed.lenmod2_factor)/256;

		frame_length = frames[frameix-1].length;
		len = (frame_length * samplerate)/1000;
		len = (len * length_factor)/256;
		length_sum += len;
		frame_lengths[frameix] = len;
	}

	if ((length_sum > 0) && (length_sum < length_min)) {
		// lengthen, so that the sequence is greater than one cycle at low pitch
		for (frameix = 1; frameix < n_frames; frameix++)
			frame_lengths[frameix] = (frame_lengths[frameix] * length_min) / length_sum;
	}

	// Second pass: queue a command for each transition which has a length
	for (frameix = 1; frameix < n_frames; frameix++) {
		frame2 = frames[frameix].frame;

		if ((fmt_params->wav_addr != 0) && ((frame1->frflags & FRFLAG_DEFER_WAV) == 0)) {
			// there is a wave file to play along with this synthesis
			seq_len_adjust = 0;
			DoSample2(fmt_params->wav_addr, which+0x100, 0, fmt_params->fmt_control, 0, wavefile_amp);
			wave_flag = 1;
			wavefile_ix = 0;
			fmt_params->wav_addr = 0;
		}

		if (modulation >= 0) {
			if (frame1->frflags & FRFLAG_MODULATE)
				modulation = 6;
			if ((frameix == n_frames-1) && (modn_flags & 0xf00))
				modulation |= modn_flags; // before or after a glottal stop
		}

		len = frame_lengths[frameix];
		pitch_length += len;
		amp_length += len;

		if (len == 0) {
			// nothing to queue, and the next sequence must not merge into this one
			last_frame = NULL;
			frame1 = frame2;
		} else {
			last_wcmdq = wcmdq_tail;

			if (modulation >= 0) {
				q = wcmdq[wcmdq_tail];
				q[0] = wcmd_spect;
				q[1] = len + (modulation << 16);
				q[2] = (intptr_t)frame1;
				q[3] = (intptr_t)frame2;

				WcmdqInc();
			}
			last_frame = frame1 = frame2;
			total_len += len;
		}
	}

	if ((which != 1) && (fmt_amplitude != 0)) {
		// cancel the amplitude adjustment, except after the start of a vowel
		q = wcmdq[wcmdq_tail];
		q[0] = WCMD_FMT_AMPLITUDE;
		q[1] = fmt_amplitude = 0;
		WcmdqInc();
	}

	return total_len;
}
1021
DoMarker(int type,int char_posn,int length,int value)1022 void DoMarker(int type, int char_posn, int length, int value)
1023 {
1024 // This could be used to return an index to the word currently being spoken
1025 // Type 1=word, 2=sentence, 3=named marker, 4=play audio, 5=end
1026
1027 if (WcmdqFree() > 5) {
1028 wcmdq[wcmdq_tail][0] = WCMD_MARKER + (type << 8);
1029 wcmdq[wcmdq_tail][1] = (char_posn & 0xffffff) | (length << 24);
1030 wcmdq[wcmdq_tail][2] = value;
1031 WcmdqInc();
1032 }
1033 }
1034
DoPhonemeMarker(int type,int char_posn,int length,char * name)1035 void DoPhonemeMarker(int type, int char_posn, int length, char *name)
1036 {
1037 // This could be used to return an index to the word currently being spoken
1038 // Type 7=phoneme
1039
1040 int *p;
1041
1042 if (WcmdqFree() > 5) {
1043 wcmdq[wcmdq_tail][0] = WCMD_MARKER + (type << 8);
1044 wcmdq[wcmdq_tail][1] = (char_posn & 0xffffff) | (length << 24);
1045 p = (int *)name;
1046 wcmdq[wcmdq_tail][2] = p[0]; // up to 8 bytes of UTF8 characters
1047 wcmdq[wcmdq_tail][3] = p[1];
1048 WcmdqInc();
1049 }
1050 }
1051
1052 #if HAVE_SONIC_H
DoSonicSpeed(int value)1053 void DoSonicSpeed(int value)
1054 {
1055 // value, multiplier * 1024
1056 wcmdq[wcmdq_tail][0] = WCMD_SONIC_SPEED;
1057 wcmdq[wcmdq_tail][1] = value;
1058 WcmdqInc();
1059 }
1060 #endif
1061
DoVoiceChange(voice_t * v)1062 espeak_ng_STATUS DoVoiceChange(voice_t *v)
1063 {
1064 // allocate memory for a copy of the voice data, and free it in wavegenfill()
1065 voice_t *v2;
1066 if ((v2 = (voice_t *)malloc(sizeof(voice_t))) == NULL)
1067 return static_cast<espeak_ng_STATUS> (ENOMEM);
1068 memcpy(v2, v, sizeof(voice_t));
1069 wcmdq[wcmdq_tail][0] = WCMD_VOICE;
1070 wcmdq[wcmdq_tail][2] = (intptr_t)v2;
1071 WcmdqInc();
1072 return static_cast<espeak_ng_STATUS> (ENS_OK);
1073 }
1074
DoEmbedded(int * embix,int sourceix)1075 void DoEmbedded(int *embix, int sourceix)
1076 {
1077 // There were embedded commands in the text at this point
1078 unsigned int word; // bit 7=last command for this word, bits 5,6 sign, bits 0-4 command
1079 unsigned int value;
1080 int command;
1081
1082 do {
1083 word = embedded_list[*embix];
1084 value = word >> 8;
1085 command = word & 0x7f;
1086
1087 if (command == 0)
1088 return; // error
1089
1090 (*embix)++;
1091
1092 switch (command & 0x1f)
1093 {
1094 case EMBED_S: // speed
1095 SetEmbedded((command & 0x60) + EMBED_S2, value); // adjusts embedded_value[EMBED_S2]
1096 SetSpeed(2);
1097 break;
1098 case EMBED_I: // play dynamically loaded wav data (sound icon)
1099 if ((int)value < n_soundicon_tab) {
1100 if (soundicon_tab[value].length != 0) {
1101 DoPause(10, 0); // ensure a break in the speech
1102 wcmdq[wcmdq_tail][0] = WCMD_WAVE;
1103 wcmdq[wcmdq_tail][1] = soundicon_tab[value].length;
1104 wcmdq[wcmdq_tail][2] = (intptr_t)soundicon_tab[value].data + 44; // skip WAV header
1105 wcmdq[wcmdq_tail][3] = 0x1500; // 16 bit data, amp=21
1106 WcmdqInc();
1107 }
1108 }
1109 break;
1110 case EMBED_M: // named marker
1111 DoMarker(espeakEVENT_MARK, (sourceix & 0x7ff) + clause_start_char, 0, value);
1112 break;
1113 case EMBED_U: // play sound
1114 DoMarker(espeakEVENT_PLAY, count_characters+1, 0, value); // always occurs at end of clause
1115 break;
1116 default:
1117 DoPause(10, 0); // ensure a break in the speech
1118 wcmdq[wcmdq_tail][0] = WCMD_EMBEDDED;
1119 wcmdq[wcmdq_tail][1] = command;
1120 wcmdq[wcmdq_tail][2] = value;
1121 WcmdqInc();
1122 break;
1123 }
1124 } while ((word & 0x80) == 0);
1125 }
1126
/*
 * Generate wavegen queue commands for the phonemes of one clause.
 *
 * phonemelist: the phoneme entries of the clause. Processing starts at
 *              index 1; the entries at ix-1, ix+1 and ix+2 are read as
 *              context for the entry at ix.
 * n_ph:        number of entries in phonemelist. It is set to 0 once the
 *              whole list has been processed.
 * resume:      false to start from the beginning of the list (this resets
 *              the position and the per-clause state kept in static and
 *              file-scope variables); true to continue from the position
 *              left by an earlier call.
 *
 * Returns 1 if the function stopped early because WcmdqFree() reported too
 * little free space for the next phoneme (the position is kept in static
 * variables), or 0 when the phoneme list is finished. If mbrola_name is not
 * empty the work is delegated to MbrolaGenerate() and its result is returned.
 */
int Generate(PHONEME_LIST *phonemelist, int *n_ph, bool resume)
{
    static int ix;
    static int embedded_ix;
    static int word_count;
    PHONEME_LIST *prev;
    PHONEME_LIST *next;
    PHONEME_LIST *next2;
    PHONEME_LIST *p;
    bool released;
    int stress;
    int modulation;
    bool pre_voiced;
    int free_min;
    int value;
    unsigned char *pitch_env = NULL;
    unsigned char *amp_env;
    PHONEME_TAB *ph;
    int use_ipa = 0;
    int done_phoneme_marker;
    int vowelstart_prev;
    char phoneme_name[16];
    static int sourceix = 0;

    PHONEME_DATA phdata;
    PHONEME_DATA phdata_prev;
    PHONEME_DATA phdata_next;
    PHONEME_DATA phdata_tone;
    FMT_PARAMS fmtp;
    static WORD_PH_DATA worddata;

    if (option_phoneme_events & espeakINITIALIZE_PHONEME_IPA)
        use_ipa = 1;

    if (mbrola_name[0] != 0)
        return MbrolaGenerate(phonemelist, n_ph, resume);

    if (resume == false) {
        // fresh clause: reset the list position and all per-clause state
        ix = 1;
        embedded_ix = 0;
        word_count = 0;
        pitch_length = 0;
        amp_length = 0;
        last_frame = NULL;
        last_wcmdq = -1;
        syllable_start = wcmdq_tail;
        syllable_end = wcmdq_tail;
        syllable_centre = -1;
        last_pitch_cmd = -1;
        memset(vowel_transition, 0, sizeof(vowel_transition));
        memset(&worddata, 0, sizeof(worddata));
        DoPause(0, 0); // isolate from the previous clause
    }

    // the upper bound keeps phonemelist[ix+2] inside the list
    while ((ix < (*n_ph)) && (ix < N_PHONEME_LIST-2)) {
        p = &phonemelist[ix];

        if (p->type == phPAUSE)
            free_min = 10;
        else if (p->type != phVOWEL)
            free_min = 15; // we need less Q space for non-vowels, and we need to generate phonemes after a vowel so that the pitch_length is filled in
        else
            free_min = MIN_WCMDQ;

        if (WcmdqFree() <= free_min)
            return 1; // wait

        prev = &phonemelist[ix-1];
        next = &phonemelist[ix+1];
        next2 = &phonemelist[ix+2];

        if (p->synthflags & SFLAG_EMBEDDED)
            DoEmbedded(&embedded_ix, p->sourceix);

        if (p->newword) {
            // last_frame is kept only for a vowel when bit 0 of LOPT_WORD_MERGE
            // is set, or for a phoneme flagged phNOPAUSE; otherwise it is cleared
            if (!(((p->type == phVOWEL) && (translator->langopts.param[LOPT_WORD_MERGE] & 1)) ||
                  (p->ph->phflags & phNOPAUSE)))
                last_frame = NULL;

            sourceix = (p->sourceix & 0x7ff) + clause_start_char;

            if (p->newword & 4)
                DoMarker(espeakEVENT_SENTENCE, sourceix, 0, count_sentences); // start of sentence

            if (p->newword & 1)
                DoMarker(espeakEVENT_WORD, sourceix, p->sourceix >> 11, clause_start_word + word_count++); // NOTE, this count doesn't include multiple-word pronunciations in *_list. eg (of a)
        }

        EndAmplitude();

        if ((p->prepause > 0) && !(p->ph->phflags & phPREVOICE))
            DoPause(p->prepause, 1);

        done_phoneme_marker = 0;
        if (option_phoneme_events && (p->ph->code != phonEND_WORD)) {
            // For vowels following a liquid or nasal, do the phoneme event after the vowel-start
            // (that case is handled in the phVOWEL branch below)
            if (!((p->type == phVOWEL) && (prev->type == phLIQUID || prev->type == phNASAL))) {
                WritePhMnemonic(phoneme_name, p->ph, p, use_ipa, NULL);
                DoPhonemeMarker(espeakEVENT_PHONEME, sourceix, 0, phoneme_name);
                done_phoneme_marker = 1;
            }
        }

        switch (p->type)
        {
        case phPAUSE:
            DoPause(p->length, 0);
            p->std_length = p->ph->std_length;
            break;
        case phSTOP:
            // "released" when followed by a vowel, or by a liquid in the same word
            released = false;
            ph = p->ph;
            if (next->type == phVOWEL)
                released = true;
            else if (!next->newword) {
                if (next->type == phLIQUID)
                    released = true;
            }
            if (released == false)
                p->synthflags |= SFLAG_NEXT_PAUSE;

            if (ph->phflags & phPREVOICE) {
                // a period of voicing before the release
                memset(&fmtp, 0, sizeof(fmtp));
                InterpretPhoneme(NULL, 0x01, p, &phdata, &worddata);
                fmtp.fmt_addr = phdata.sound_addr[pd_FMT];
                fmtp.fmt_amp = phdata.sound_param[pd_FMT];

                if (last_pitch_cmd < 0) {
                    DoAmplitude(next->amp, NULL);
                    DoPitch(envelope_data[p->env], next->pitch1, next->pitch2);
                }

                DoSpect2(ph, 0, &fmtp, p, 0);
            }

            InterpretPhoneme(NULL, 0, p, &phdata, &worddata);
            phdata.pd_control |= pd_DONTLENGTHEN;
            DoSample3(&phdata, 0, 0);
            break;
        case phFRICATIVE:
            InterpretPhoneme(NULL, 0, p, &phdata, &worddata);

            if (p->synthflags & SFLAG_LENGTHEN)
                DoSample3(&phdata, p->length, 0); // play it twice for [s:] etc.
            DoSample3(&phdata, p->length, 0);
            break;
        case phVSTOP:
            ph = p->ph;
            memset(&fmtp, 0, sizeof(fmtp));
            fmtp.fmt_control = pd_DONTLENGTHEN;

            pre_voiced = false;
            if (next->type == phVOWEL) {
                DoAmplitude(p->amp, NULL);
                DoPitch(envelope_data[p->env], p->pitch1, p->pitch2);
                pre_voiced = true;
            } else if ((next->type == phLIQUID) && !next->newword) {
                DoAmplitude(next->amp, NULL);
                DoPitch(envelope_data[next->env], next->pitch1, next->pitch2);
                pre_voiced = true;
            } else {
                if (last_pitch_cmd < 0) {
                    DoAmplitude(next->amp, NULL);
                    DoPitch(envelope_data[p->env], p->pitch1, p->pitch2);
                }
            }

            if ((prev->type == phVOWEL) || (ph->phflags & phPREVOICE)) {
                // a period of voicing before the release
                InterpretPhoneme(NULL, 0x01, p, &phdata, &worddata);
                fmtp.fmt_addr = phdata.sound_addr[pd_FMT];
                fmtp.fmt_amp = phdata.sound_param[pd_FMT];

                DoSpect2(ph, 0, &fmtp, p, 0);
                if (p->synthflags & SFLAG_LENGTHEN) {
                    DoPause(25, 1);
                    DoSpect2(ph, 0, &fmtp, p, 0);
                }
            } else {
                if (p->synthflags & SFLAG_LENGTHEN)
                    DoPause(50, 0);
            }

            if (pre_voiced) {
                // followed by a vowel, or liquid + vowel
                StartSyllable();
            } else
                p->synthflags |= SFLAG_NEXT_PAUSE;
            InterpretPhoneme(NULL, 0, p, &phdata, &worddata);
            fmtp.fmt_addr = phdata.sound_addr[pd_FMT];
            fmtp.fmt_amp = phdata.sound_param[pd_FMT];
            fmtp.wav_addr = phdata.sound_addr[pd_ADDWAV];
            fmtp.wav_amp = phdata.sound_param[pd_ADDWAV];
            DoSpect2(ph, 0, &fmtp, p, 0);

            // short pause before a following fricative, only when neither this
            // phoneme nor the one after next starts a word
            if ((p->newword == 0) && (next2->newword == 0)) {
                if (next->type == phVFRICATIVE)
                    DoPause(20, 0);
                if (next->type == phFRICATIVE)
                    DoPause(12, 0);
            }
            break;
        case phVFRICATIVE:
            if (next->type == phVOWEL) {
                DoAmplitude(p->amp, NULL);
                DoPitch(envelope_data[p->env], p->pitch1, p->pitch2);
            } else if (next->type == phLIQUID) {
                DoAmplitude(next->amp, NULL);
                DoPitch(envelope_data[next->env], next->pitch1, next->pitch2);
            } else {
                if (last_pitch_cmd < 0) {
                    DoAmplitude(p->amp, NULL);
                    DoPitch(envelope_data[p->env], p->pitch1, p->pitch2);
                }
            }

            if ((next->type == phVOWEL) || ((next->type == phLIQUID) && (next->newword == 0))) // ?? test 14.Aug.2007
                StartSyllable();
            else
                p->synthflags |= SFLAG_NEXT_PAUSE;
            InterpretPhoneme(NULL, 0, p, &phdata, &worddata);
            memset(&fmtp, 0, sizeof(fmtp));
            fmtp.std_length = phdata.pd_param[i_SET_LENGTH]*2;
            fmtp.fmt_addr = phdata.sound_addr[pd_FMT];
            fmtp.fmt_amp = phdata.sound_param[pd_FMT];
            fmtp.wav_addr = phdata.sound_addr[pd_ADDWAV];
            fmtp.wav_amp = phdata.sound_param[pd_ADDWAV];

            // a lengthened phoneme is generated twice
            if (p->synthflags & SFLAG_LENGTHEN)
                DoSpect2(p->ph, 0, &fmtp, p, 0);
            DoSpect2(p->ph, 0, &fmtp, p, 0);
            break;
        case phNASAL:
            memset(&fmtp, 0, sizeof(fmtp));
            if (!(p->synthflags & SFLAG_SEQCONTINUE)) {
                DoAmplitude(p->amp, NULL);
                DoPitch(envelope_data[p->env], p->pitch1, p->pitch2);
            }

            if (prev->type == phNASAL)
                last_frame = NULL;

            InterpretPhoneme(NULL, 0, p, &phdata, &worddata);
            fmtp.std_length = phdata.pd_param[i_SET_LENGTH]*2;
            fmtp.fmt_addr = phdata.sound_addr[pd_FMT];
            fmtp.fmt_amp = phdata.sound_param[pd_FMT];

            if (next->type == phVOWEL) {
                StartSyllable();
                DoSpect2(p->ph, 0, &fmtp, p, 0);
            } else if (prev->type == phVOWEL && (p->synthflags & SFLAG_SEQCONTINUE))
                DoSpect2(p->ph, 0, &fmtp, p, 0);
            else {
                last_frame = NULL; // only for nasal ?
                DoSpect2(p->ph, 0, &fmtp, p, 0);
                last_frame = NULL;
            }

            break;
        case phLIQUID:
            memset(&fmtp, 0, sizeof(fmtp));
            modulation = 0;
            if (p->ph->phflags & phTRILL)
                modulation = 5;

            if (!(p->synthflags & SFLAG_SEQCONTINUE)) {
                DoAmplitude(p->amp, NULL);
                DoPitch(envelope_data[p->env], p->pitch1, p->pitch2);
            }

            if (prev->type == phNASAL)
                last_frame = NULL;

            if (next->type == phVOWEL)
                StartSyllable();
            InterpretPhoneme(NULL, 0, p, &phdata, &worddata);

            // add only the part of the phoneme's own pause that exceeds the
            // prepause value of this list entry
            if ((value = (phdata.pd_param[i_PAUSE_BEFORE] - p->prepause)) > 0)
                DoPause(value, 1);
            fmtp.std_length = phdata.pd_param[i_SET_LENGTH]*2;
            fmtp.fmt_addr = phdata.sound_addr[pd_FMT];
            fmtp.fmt_amp = phdata.sound_param[pd_FMT];
            fmtp.wav_addr = phdata.sound_addr[pd_ADDWAV];
            fmtp.wav_amp = phdata.sound_param[pd_ADDWAV];
            DoSpect2(p->ph, 0, &fmtp, p, modulation);
            break;
        case phVOWEL:
            ph = p->ph;
            stress = p->stresslevel & 0xf;

            memset(&fmtp, 0, sizeof(fmtp));

            InterpretPhoneme(NULL, 0, p, &phdata, &worddata);
            fmtp.std_length = phdata.pd_param[i_SET_LENGTH] * 2;
            vowelstart_prev = 0;

            if (((fmtp.fmt_addr = phdata.sound_addr[pd_VWLSTART]) != 0) && ((phdata.pd_control & pd_FORNEXTPH) == 0)) {
                // a vowel start has been specified by the Vowel program
                fmtp.fmt_length = phdata.sound_param[pd_VWLSTART];
            } else if (prev->type != phPAUSE) {
                // check the previous phoneme
                InterpretPhoneme(NULL, 0, prev, &phdata_prev, NULL);
                if (((fmtp.fmt_addr = phdata_prev.sound_addr[pd_VWLSTART]) != 0) && (phdata_prev.pd_control & pd_FORNEXTPH)) {
                    // a vowel start has been specified by the previous phoneme
                    vowelstart_prev = 1;
                    fmtp.fmt2_lenadj = phdata_prev.sound_param[pd_VWLSTART];
                }
                fmtp.transition0 = phdata_prev.vowel_transition[0];
                fmtp.transition1 = phdata_prev.vowel_transition[1];
            }

            if (fmtp.fmt_addr == 0) {
                // use the default start for this vowel
                fmtp.use_vowelin = 1;
                fmtp.fmt_control = 1;
                fmtp.fmt_addr = phdata.sound_addr[pd_FMT];
            }

            fmtp.fmt_amp = phdata.sound_param[pd_FMT];

            // a tone phoneme, if present, overrides the pitch envelope and may
            // supply an amplitude envelope
            pitch_env = envelope_data[p->env];
            amp_env = NULL;
            if (p->tone_ph != 0) {
                InterpretPhoneme2(p->tone_ph, &phdata_tone);
                pitch_env = GetEnvelope(phdata_tone.pitch_env);
                if (phdata_tone.amp_env > 0)
                    amp_env = GetEnvelope(phdata_tone.amp_env);
            }

            StartSyllable();

            modulation = 2;
            if (stress <= 1)
                modulation = 1; // 16ths
            else if (stress >= 7)
                modulation = 3;

            // first half of the vowel (second argument 1 to DoSpect2)
            if (prev->type == phVSTOP || prev->type == phVFRICATIVE) {
                DoAmplitude(p->amp, amp_env);
                DoPitch(pitch_env, p->pitch1, p->pitch2); // don't use prevocalic rising tone
                DoSpect2(ph, 1, &fmtp, p, modulation);
            } else if (prev->type == phLIQUID || prev->type == phNASAL) {
                DoAmplitude(p->amp, amp_env);
                DoSpect2(ph, 1, &fmtp, p, modulation); // continue with pre-vocalic rising tone
                DoPitch(pitch_env, p->pitch1, p->pitch2);
            } else if (vowelstart_prev) {
                // VowelStart from the previous phoneme, but not phLIQUID or phNASAL
                DoPitch(envelope_data[PITCHrise], p->pitch2 - 15, p->pitch2);
                DoAmplitude(p->amp-1, amp_env);
                DoSpect2(ph, 1, &fmtp, p, modulation); // continue with pre-vocalic rising tone
                DoPitch(pitch_env, p->pitch1, p->pitch2);
            } else {
                if (!(p->synthflags & SFLAG_SEQCONTINUE)) {
                    DoAmplitude(p->amp, amp_env);
                    DoPitch(pitch_env, p->pitch1, p->pitch2);
                }

                DoSpect2(ph, 1, &fmtp, p, modulation);
            }

            // phoneme event that was deferred until after the vowel start
            if ((option_phoneme_events) && (done_phoneme_marker == 0)) {
                WritePhMnemonic(phoneme_name, p->ph, p, use_ipa, NULL);
                DoPhonemeMarker(espeakEVENT_PHONEME, sourceix, 0, phoneme_name);
            }

            // second half of the vowel (second argument 2 to DoSpect2)
            fmtp.fmt_addr = phdata.sound_addr[pd_FMT];
            fmtp.fmt_amp = phdata.sound_param[pd_FMT];
            fmtp.transition0 = 0;
            fmtp.transition1 = 0;

            if ((fmtp.fmt2_addr = phdata.sound_addr[pd_VWLEND]) != 0)
                fmtp.fmt2_lenadj = phdata.sound_param[pd_VWLEND];
            else if (next->type != phPAUSE) {
                fmtp.fmt2_lenadj = 0;
                InterpretPhoneme(NULL, 0, next, &phdata_next, NULL);

                fmtp.use_vowelin = 1;
                fmtp.transition0 = phdata_next.vowel_transition[2]; // always do vowel_transition, even if ph_VWLEND ?? consider [N]
                fmtp.transition1 = phdata_next.vowel_transition[3];

                if ((fmtp.fmt2_addr = phdata_next.sound_addr[pd_VWLEND]) != 0)
                    fmtp.fmt2_lenadj = phdata_next.sound_param[pd_VWLEND];
            }

            DoSpect2(ph, 2, &fmtp, p, modulation);
            break;
        default:
            // any other phoneme type produces no sound commands here
            break;
        }
        ix++;
    }
    EndPitch(1);
    if (*n_ph > 0) {
        DoMarker(espeakEVENT_END, count_characters, 0, count_sentences); // end of clause
        *n_ph = 0;
    }

    return 0; // finished the phoneme list
}
1526
SpeakNextClause(int control)1527 int SpeakNextClause(int control)
1528 {
1529 // Speak text from memory (text_in)
1530 // control 0: start
1531 // text_in is set
1532
1533 // The other calls have text_in = NULL
1534 // control 1: speak next text
1535 // 2: stop
1536
1537 int clause_tone;
1538 char *voice_change;
1539 const char *phon_out;
1540
1541 if (control == 2) {
1542 // stop speaking
1543 n_phoneme_list = 0;
1544 WcmdqStop();
1545
1546 return 0;
1547 }
1548
1549 if (text_decoder_eof(p_decoder)) {
1550 skipping_text = 0;
1551 return 0;
1552 }
1553
1554 if (current_phoneme_table != voice->phoneme_tab_ix)
1555 SelectPhonemeTable(voice->phoneme_tab_ix);
1556
1557 // read the next clause from the input text file, translate it, and generate
1558 // entries in the wavegen command queue
1559 TranslateClause(translator, &clause_tone, &voice_change);
1560
1561 CalcPitches(translator, clause_tone);
1562 CalcLengths(translator);
1563
1564 if ((option_phonemes & 0xf) || (phoneme_callback != NULL)) {
1565 phon_out = GetTranslatedPhonemeString(option_phonemes);
1566 if (option_phonemes & 0xf)
1567 fprintf(f_trans, "%s\n", phon_out);
1568 if (phoneme_callback != NULL)
1569 phoneme_callback(phon_out);
1570 }
1571
1572 if (skipping_text) {
1573 n_phoneme_list = 0;
1574 return 1;
1575 }
1576
1577 Generate(phoneme_list, &n_phoneme_list, 0);
1578
1579 if (voice_change != NULL) {
1580 // voice change at the end of the clause (i.e. clause was terminated by a voice change)
1581 new_voice = LoadVoiceVariant(voice_change, 0); // add a Voice instruction to wavegen at the end of the clause
1582 }
1583
1584 if (new_voice) {
1585 // finished the current clause, now change the voice if there was an embedded
1586 // change voice command at the end of it (i.e. clause was broken at the change voice command)
1587 DoVoiceChange(voice);
1588 new_voice = NULL;
1589 }
1590
1591 return 1;
1592 }
1593